github.com/hiyorimi/geobed@v0.0.0-20190227204948-42ebdc6a8871/geobed.go (about)

     1  package geobed
     2  
     3  import (
     4  	"archive/zip"
     5  	"bufio"
     6  	"bytes"
     7  	"compress/gzip"
     8  	"encoding/gob"
     9  	"io"
    10  	"log"
    11  	"net/http"
    12  	"os"
    13  	"regexp"
    14  	"sort"
    15  	"strconv"
    16  	"strings"
    17  
    18  	geohash "github.com/mmcloughlin/geohash"
    19  )
    20  
    21  // There are over 2.4 million cities in the world. The Geonames data set only
    22  // contains 143,270 and the MaxMind set contains 567,382 and 3,173,959 in the
    23  //  other MaxMind set.
// Obviously there's a lot of overlap and the worldcitiespop.txt from MaxMind
// contains a lot of dupes, though it is by far the most comprehensive in terms
// of city - lat/lng.
    27  // It may not be possible to have information for all cities, but many of the
    28  // cities are also fairly remote and likely don't have internet access anyway.
// The Geonames data is preferred because it contains additional information
// such as elevation, population, and more. Population is particularly nice
// because it gives applications a sense of the city size, so showing all major
// cities is pretty easy. Though the primary goal of this package is to
// geocode, the additional information is a bonus. So after checking the
// Geonames set, the geocoding functions will then look at MaxMind's.
    37  // Maybe in the future this package will even use the Geonames premium data
    38  // and have functions to look up nearest airports, etc.
    39  // I would simply use just Geonames data, but there's so many more cities
    40  // in the MaxMind set despite the lack of additional details.
    41  //
    42  // http://download.geonames.org/export/dump/cities1000.zip
    43  // http://geolite.maxmind.com/download/geoip/database/GeoLiteCity_CSV/GeoLiteCity-latest.zip
    44  // http://download.maxmind.com/download/worldcities/worldcitiespop.txt.gz
    45  
    46  // A list of data sources.
    47  var dataSetFiles = []map[string]string{
    48  	{"url": "http://download.geonames.org/export/dump/cities1000.zip",
    49  		"path": "./geobed-data/cities1000.zip", "id": "geonamesCities1000"},
    50  	{"url": "http://download.geonames.org/export/dump/countryInfo.txt",
    51  		"path": "./geobed-data/countryInfo.txt", "id": "geonamesCountryInfo"},
    52  	{"url": "https://github.com/CODAIT/redrock/raw/master/twitter-decahose/src/main/resources/Location/worldcitiespop.txt.gz",
    53  		"path": "./geobed-data/worldcitiespop.txt.gz", "id": "maxmindWorldCities"},
    54  	//{"url": "http://geolite.maxmind.com/download/geoip/database/GeoLiteCity_CSV/GeoLiteCity-latest.zip",
    55  	//"path": "./geobed-data/GeoLiteCity-latest.zip", "id": "maxmindLiteCity"},
    56  }
    57  
// UsSateCodes maps two-letter US codes to full names. Besides the 50 states
// it contains the territory codes and the armed forces codes listed in the
// sections below.
//
// NOTE: the identifier is misspelled ("Sate" rather than "State"). It is
// exported, so the name is kept as is for backward compatibility.
var UsSateCodes = map[string]string{
	"AL": "Alabama",
	"AK": "Alaska",
	"AZ": "Arizona",
	"AR": "Arkansas",
	"CA": "California",
	"CO": "Colorado",
	"CT": "Connecticut",
	"DE": "Delaware",
	"FL": "Florida",
	"GA": "Georgia",
	"HI": "Hawaii",
	"ID": "Idaho",
	"IL": "Illinois",
	"IN": "Indiana",
	"IA": "Iowa",
	"KS": "Kansas",
	"KY": "Kentucky",
	"LA": "Louisiana",
	"ME": "Maine",
	"MD": "Maryland",
	"MA": "Massachusetts",
	"MI": "Michigan",
	"MN": "Minnesota",
	"MS": "Mississippi",
	"MO": "Missouri",
	"MT": "Montana",
	"NE": "Nebraska",
	"NV": "Nevada",
	"NH": "New Hampshire",
	"NJ": "New Jersey",
	"NM": "New Mexico",
	"NY": "New York",
	"NC": "North Carolina",
	"ND": "North Dakota",
	"OH": "Ohio",
	"OK": "Oklahoma",
	"OR": "Oregon",
	"PA": "Pennsylvania",
	"RI": "Rhode Island",
	"SC": "South Carolina",
	"SD": "South Dakota",
	"TN": "Tennessee",
	"TX": "Texas",
	"UT": "Utah",
	"VT": "Vermont",
	"VA": "Virginia",
	"WA": "Washington",
	"WV": "West Virginia",
	"WI": "Wisconsin",
	"WY": "Wyoming",
	// Territories (this section also holds the District of Columbia).
	"AS": "American Samoa",
	"DC": "District of Columbia",
	"FM": "Federated States of Micronesia",
	"GU": "Guam",
	"MH": "Marshall Islands",
	"MP": "Northern Mariana Islands",
	"PW": "Palau",
	"PR": "Puerto Rico",
	"VI": "Virgin Islands",
	// Armed Forces (AE includes Europe, Africa, Canada, and the Middle East).
	"AA": "Armed Forces Americas",
	"AE": "Armed Forces Europe",
	"AP": "Armed Forces Pacific",
}
   125  
// GeoBed contains all of the city and country data.
//
// NOTE(review): this comment used to say that cities are split into buckets
// by country. The struct as declared here holds one flat Cities slice, which
// loadDataSets sorts by lowercased city name.
type GeoBed struct {
	c  Cities        // every loaded city
	co []CountryInfo // country records, appended by loadGeonamesCountryInfo
}
   132  
   133  // Cities is a type alias to hold slice of GeobedCity.
   134  type Cities []GeobedCity
   135  
   136  func (c Cities) Len() int {
   137  	return len(c)
   138  }
   139  func (c Cities) Swap(i, j int) {
   140  	c[i], c[j] = c[j], c[i]
   141  }
   142  func (c Cities) Less(i, j int) bool {
   143  	return toLower(c[i].City) < toLower(c[j].City)
   144  }
   145  
// GeobedCity is the combined city record (the various data sets have different
// fields, this combines what's available and keeps things smaller).
//
// Both loaders in this file fill it in: loadGeonamesCities1000 sets every
// field, loadMaxmindWorldCities sets everything except CityAlt.
type GeobedCity struct {
	City    string // city name with surrounding spaces trimmed
	CityAlt string // Geonames column 3, stored verbatim
	// TODO: Think about converting this to a small int to save on memory
	// allocation. Lookup requests can have the strings converted to the
	// same int if there are any matches.
	// This could make lookup more accurate, easier, and faster even. IF
	// the int uses less bytes than the two letter code string.
	Country    string  // Geonames column 8, or MaxMind column 0 passed through toUpper
	Region     string  // Geonames column 10, or MaxMind column 3
	Latitude   float64 // parsed with strconv.ParseFloat; 0 when unparsable
	Longitude  float64 // parsed with strconv.ParseFloat; 0 when unparsable
	Population int32   // parsed with strconv.Atoi; 0 when unparsable
	Geohash    string  // geohash.Encode(Latitude, Longitude); "" when that yields "7zzzzzzzzzzz"
}
   163  
   164  // TODO: String interning? (much like converting country code to int)
   165  // https://gist.github.com/karlseguin/6570372
   166  
   167  // TODO: Store the cities in mmap...???
   168  // https://github.com/boltdb/bolt/blob/master/bolt_unix.go#L42-L69
   169  // Maybe even use bolt?
   170  
   171  var maxMindCityDedupeIdx map[string][]string
   172  
   173  // Holds information about the index ranges for city names (1st and 2nd
   174  // characters) to help narrow down sets of the GeobedCity slice to scan
   175  // when looking for a match.
   176  var cityNameIdx map[string]int
   177  var locationDedupeIdx map[string]bool
   178  
// CountryInfo contains information about each country from Geonames,
// including ISO codes, FIPS, country capital, area (sq km), population, etc.
// Particularly useful for validating that a location string contains a country
// name, which can help the search process.
//
// loadGeonamesCountryInfo fills one value per data row of countryInfo.txt,
// taking the tab separated columns in this order: ISO, ISO3, ISONumeric, Fips,
// Country, Capital, Area, Population, Continent, Tld, CurrencyCode,
// CurrencyName, Phone, PostalCodeFormat, PostalCodeRegex, Languages,
// GeonameID, Neighbours, EquivalentFipsCode. Numeric fields that cannot be
// parsed are left at 0.
//
// NOTE(review): this comment used to mention a slice of partial geohashes for
// narrowing down reverse geocoding lookups; no such field is declared here.
type CountryInfo struct {
	Country            string
	Capital            string
	Area               int32
	Population         int32
	GeonameID          int32
	ISONumeric         int16
	ISO                string
	ISO3               string
	Fips               string
	Continent          string
	Tld                string
	CurrencyCode       string
	CurrencyName       string
	Phone              string
	PostalCodeFormat   string
	PostalCodeRegex    string
	Languages          string
	Neighbours         string
	EquivalentFipsCode string
}
   206  
// GeocodeOptions contains options for geocoding.
// For now there is just an exact match on city name, but there will
// potentially be other options that can be set to adjust how
// searching/matching works.
type GeocodeOptions struct {
	// ExactCity makes Geocode use exactMatchCity instead of the scored
	// fuzzyMatchLocation search.
	ExactCity bool
}
   214  
// r is an index range used for narrowing down the part of the large Cities
// slice that has to be scanned. Ranges are applied as g.c[f:t].
type r struct {
	f int // from: lower bound of the slice expression
	t int // to: upper bound of the slice expression (exclusive)
}
   221  
   222  // NewGeobed creates a new Geobed instance. You do not need more than one.
   223  // Should be a singleton. There's a fair bit of data to load into memory.
   224  func NewGeobed() GeoBed {
   225  	g := GeoBed{}
   226  
   227  	var err error
   228  	g.c, err = loadGeobedCityData()
   229  	if err != nil {
   230  		log.Println("Got err", err, "loading GeobedCity Data into memory.")
   231  	}
   232  	g.co, err = loadGeobedCountryData()
   233  	if err != nil {
   234  		log.Println("Got err", err, "loading GeobedCountry Data into memory.")
   235  	}
   236  	err = loadGeobedCityNameIdx()
   237  	if err != nil || len(g.c) == 0 {
   238  		log.Println("Got err", err, "loading data into memory. Will try to download now.")
   239  		g.downloadDataSets()
   240  		g.loadDataSets()
   241  		g.store()
   242  	}
   243  
   244  	return g
   245  }
   246  
   247  // Downloads the data sets if needed.
   248  func (g *GeoBed) downloadDataSets() {
   249  	os.Mkdir("./geobed-data", 0777)
   250  	for _, f := range dataSetFiles {
   251  		_, err := os.Stat(f["path"])
   252  		if err != nil {
   253  			if os.IsNotExist(err) {
   254  				// log.Println(f["path"] + " does not exist, downloading...")
   255  				out, oErr := os.Create(f["path"])
   256  				defer out.Close()
   257  				if oErr == nil {
   258  					r, rErr := http.Get(f["url"])
   259  					if r.StatusCode == 404 {
   260  						log.Println("Got 404 downloading ", f["url"], "file. Try placing it in", f["path"],
   261  							"manually.")
   262  						return
   263  					}
   264  					if rErr == nil {
   265  						_, nErr := io.Copy(out, r.Body)
   266  						if nErr != nil {
   267  							log.Println("Failed to copy data file, it will be tried again" +
   268  								"on next application start.")
   269  							// remove file so another attempt can be made, should something fail
   270  							err = os.Remove(f["path"])
   271  							if err != nil {
   272  								log.Println("Encountered an error on file deletion:", err)
   273  							}
   274  						}
   275  						r.Body.Close()
   276  					}
   277  					defer r.Body.Close()
   278  					out.Close()
   279  				} else {
   280  					log.Println(oErr)
   281  				}
   282  			}
   283  		}
   284  	}
   285  }
   286  
   287  func (g *GeoBed) createCityNamesLocationsIndex() {
   288  	// Index the locations of city names in the g.c []GeoCity slice. This way when
   289  	// searching the range can be limited so it will be faster.
   290  	cityNameIdx = make(map[string]int)
   291  	for k, v := range g.c {
   292  		// Get the index key for the first character of the city name.
   293  		ik := toLower(string(v.City[0]))
   294  		if val, ok := cityNameIdx[ik]; ok {
   295  			// If this key number is greater than what was previously recorded,
   296  			// then set it as the new indexed key.
   297  			if val < k {
   298  				cityNameIdx[ik] = k
   299  			}
   300  		} else {
   301  			// If the index key has not yet been set for this value, then set it.
   302  			cityNameIdx[ik] = k
   303  		}
   304  
   305  		// Get the index key for the first two characters of the city name.
   306  		// if len(v.CityLower) >= 2 {
   307  		// 	ik2 := v.CityLower[0:2]
   308  		// 	if val, ok := cityNameIdx[ik2]; ok {
   309  		// 		// If this key number is greater than what was previously
   310  		//		// recorded, then set it as the new indexed key.
   311  		// 		if val < k {
   312  		// 			cityNameIdx[ik2] = k
   313  		// 		}
   314  		// 	} else {
   315  		// 		// If the index key has not yet been set for this value, then set it.
   316  		// 		cityNameIdx[ik2] = k
   317  		// 	}
   318  		// }
   319  	}
   320  }
   321  
   322  func (g *GeoBed) loadGeonamesCities1000(f map[string]string) {
   323  	rz, err := zip.OpenReader(f["path"])
   324  	if err != nil {
   325  		log.Fatal(err)
   326  	}
   327  	defer rz.Close()
   328  
   329  	for _, uF := range rz.File {
   330  		fi, err := uF.Open()
   331  
   332  		if err != nil {
   333  			log.Fatal(err)
   334  		}
   335  		defer fi.Close()
   336  
   337  		// Geonames uses a tab delineated format and it's not even
   338  		// consistent. No CSV reader that I've found for Go can understand this.
   339  		// I'm not expecting any reader to either because it's an
   340  		// invalid CSV to be frank. However, we can still split up each row by \t
   341  		scanner := bufio.NewScanner(fi)
   342  		scanner.Split(bufio.ScanLines)
   343  
   344  		i := 1
   345  		for scanner.Scan() {
   346  			i++
   347  
   348  			// So regexp, sadly, must be used (well, unless I wanted parse
   349  			//  each string byte by byte, pushing each into a buffer to
   350  			// append to a slice until a tab is reached, etc.).
   351  			// But I'd have to also then put in a condition if the next
   352  			// byte was a \t rune, then append an empty string, etc. This
   353  			//  just, for now, seems nicer (easier).
   354  			// This is only an import/update, so it shouldn't be an issue
   355  			//  for performance. If it is, then I'll look into other solutions.
   356  			fields := regexp.MustCompile("\t").Split(scanner.Text(), 19)
   357  
   358  			// NOTE: Now using a combined GeobedCity struct since not all
   359  			// data sets have the same fields.
   360  			// Plus, the entire point was to geocode forward and reverse.
   361  			//  Bonus information like elevation and such is just superfluous.
   362  			// Leaving it here because it may be configurable... If options
   363  			//  are passed to NewGeobed() then maybe Geobed can simply be a Geonames search.
   364  			// Don't even load in MaxMind data...And if that's the case,
   365  			//  maybe that bonus information is desired.
   366  			if len(fields) == 19 {
   367  				//id, _ := strconv.Atoi(fields[0])
   368  				lat, _ := strconv.ParseFloat(fields[4], 64)
   369  				lng, _ := strconv.ParseFloat(fields[5], 64)
   370  				pop, _ := strconv.Atoi(fields[14])
   371  				//elv, _ := strconv.Atoi(fields[15])
   372  				//dem, _ := strconv.Atoi(fields[16])
   373  
   374  				gh := geohash.Encode(lat, lng)
   375  				// This is produced with empty lat/lng values - don't store it.
   376  				if gh == "7zzzzzzzzzzz" {
   377  					gh = ""
   378  				}
   379  
   380  				var c GeobedCity
   381  				c.City = strings.Trim(string(fields[1]), " ")
   382  				c.CityAlt = string(fields[3])
   383  				c.Country = string(fields[8])
   384  				c.Region = string(fields[10])
   385  				c.Latitude = lat
   386  				c.Longitude = lng
   387  				c.Population = int32(pop)
   388  				c.Geohash = gh
   389  
   390  				// Don't include entries without a city name. If we want to
   391  				// geocode the centers of countries and states, then we can
   392  				// do that faster through other means.
   393  				if len(c.City) > 0 {
   394  					g.c = append(g.c, c)
   395  				}
   396  			}
   397  		}
   398  	}
   399  }
   400  
   401  func (g *GeoBed) loadMaxmindWorldCities(f map[string]string) {
   402  	// It also has a lot of dupes
   403  	maxMindCityDedupeIdx = make(map[string][]string)
   404  
   405  	fi, err := os.Open(f["path"])
   406  	if err != nil {
   407  		log.Println(err)
   408  	}
   409  	defer fi.Close()
   410  
   411  	fz, err := gzip.NewReader(fi)
   412  	if err != nil {
   413  		log.Println(err)
   414  	}
   415  	defer fz.Close()
   416  
   417  	scanner := bufio.NewScanner(fz)
   418  	scanner.Split(bufio.ScanLines)
   419  
   420  	i := 1
   421  	for scanner.Scan() {
   422  		i++
   423  		t := scanner.Text()
   424  
   425  		fields := strings.Split(t, ",")
   426  		if len(fields) == 7 {
   427  			var b bytes.Buffer
   428  			b.WriteString(fields[0]) // country
   429  			b.WriteString(fields[3]) // region
   430  			b.WriteString(fields[1]) // city
   431  
   432  			idx := b.String()
   433  			b.Reset()
   434  			maxMindCityDedupeIdx[idx] = fields
   435  		}
   436  	}
   437  
   438  	// Loop the map of fields after dupes have been removed (about 1/5th
   439  	// less... 2.6m vs 3.1m inreases lookup performance).
   440  	for _, fields := range maxMindCityDedupeIdx {
   441  		if fields[0] != "" && fields[0] != "0" {
   442  			if fields[2] != "AccentCity" {
   443  				pop, _ := strconv.Atoi(fields[4])
   444  				lat, _ := strconv.ParseFloat(fields[5], 64)
   445  				lng, _ := strconv.ParseFloat(fields[6], 64)
   446  				// MaxMind's data set is a bit dirty. I've seen city names
   447  				// surrounded by parenthesis in a few places.
   448  				cn := strings.Trim(string(fields[2]), " ")
   449  				cn = strings.Trim(cn, "( )")
   450  
   451  				// Don't take any city names with erroneous punctuation either.
   452  				if strings.Contains(cn, "!") || strings.Contains(cn, "@") {
   453  					continue
   454  				}
   455  
   456  				gh := geohash.Encode(lat, lng)
   457  				// This is produced with empty lat/lng values - don't store it.
   458  				if gh == "7zzzzzzzzzzz" {
   459  					gh = ""
   460  				}
   461  
   462  				// If the geohash was seen before...
   463  				_, ok := locationDedupeIdx[gh]
   464  				if !ok {
   465  					locationDedupeIdx[gh] = true
   466  
   467  					var c GeobedCity
   468  					c.City = cn
   469  					c.Country = toUpper(string(fields[0]))
   470  					c.Region = string(fields[3])
   471  					c.Latitude = lat
   472  					c.Longitude = lng
   473  					c.Population = int32(pop)
   474  					c.Geohash = gh
   475  
   476  					// Don't include entries without a city name. If we want
   477  					// to geocode the centers of countries and states, then
   478  					// we can do that faster through other means.
   479  					if len(c.City) > 0 && len(c.Country) > 0 {
   480  						g.c = append(g.c, c)
   481  					}
   482  				}
   483  			}
   484  		}
   485  	}
   486  	// Clear out the temrporary index (set to nil, it does get re-created)
   487  	// so that Go can garbage collect it at some point whenever it feels the need.
   488  	maxMindCityDedupeIdx = nil
   489  	locationDedupeIdx = nil
   490  }
   491  
   492  func (g *GeoBed) loadGeonamesCountryInfo(f map[string]string) {
   493  	fi, err := os.Open(f["path"])
   494  
   495  	if err != nil {
   496  		log.Fatal(err)
   497  	}
   498  	defer fi.Close()
   499  
   500  	scanner := bufio.NewScanner(fi)
   501  	scanner.Split(bufio.ScanLines)
   502  
   503  	i := 1
   504  	for scanner.Scan() {
   505  		t := scanner.Text()
   506  		// There are a bunch of lines in this file that are comments, they start with #
   507  		if string(t[0]) != "#" {
   508  			i++
   509  			fields := regexp.MustCompile("\t").Split(t, 19)
   510  
   511  			if len(fields) == 19 {
   512  				if fields[0] != "" && fields[0] != "0" {
   513  					isoNumeric, _ := strconv.Atoi(fields[2])
   514  					area, _ := strconv.Atoi(fields[6])
   515  					pop, _ := strconv.Atoi(fields[7])
   516  					gid, _ := strconv.Atoi(fields[16])
   517  
   518  					var ci CountryInfo
   519  					ci.ISO = string(fields[0])
   520  					ci.ISO3 = string(fields[1])
   521  					ci.ISONumeric = int16(isoNumeric)
   522  					ci.Fips = string(fields[3])
   523  					ci.Country = string(fields[4])
   524  					ci.Capital = string(fields[5])
   525  					ci.Area = int32(area)
   526  					ci.Population = int32(pop)
   527  					ci.Continent = string(fields[8])
   528  					ci.Tld = string(fields[9])
   529  					ci.CurrencyCode = string(fields[10])
   530  					ci.CurrencyName = string(fields[11])
   531  					ci.Phone = string(fields[12])
   532  					ci.PostalCodeFormat = string(fields[13])
   533  					ci.PostalCodeRegex = string(fields[14])
   534  					ci.Languages = string(fields[15])
   535  					ci.GeonameID = int32(gid)
   536  					ci.Neighbours = string(fields[17])
   537  					ci.EquivalentFipsCode = string(fields[18])
   538  
   539  					g.co = append(g.co, ci)
   540  				}
   541  			}
   542  		}
   543  	}
   544  }
   545  
   546  // Unzips the data sets and loads the data.
   547  func (g *GeoBed) loadDataSets() {
   548  	locationDedupeIdx = make(map[string]bool)
   549  
   550  	for _, f := range dataSetFiles {
   551  		// This one is zipped
   552  		if f["id"] == "geonamesCities1000" {
   553  			g.loadGeonamesCities1000(f)
   554  		}
   555  
   556  		// ...And this one is Gzipped (and this one may have worked with the CSV
   557  		// package, but parse it the same way as the others line by line)
   558  		if f["id"] == "maxmindWorldCities" {
   559  			g.loadMaxmindWorldCities(f)
   560  		}
   561  
   562  		// ...And this one is just plain text
   563  		if f["id"] == "geonamesCountryInfo" {
   564  			g.loadGeonamesCountryInfo(f)
   565  		}
   566  	}
   567  
   568  	// Sort []GeobedCity by city names to help with binary search (the City field is the
   569  	// most searched upon field and the matching names can be easily filtered down from there).
   570  	sort.Sort(g.c)
   571  
   572  	//debug
   573  	//log.Println("TOTAL RECORDS:")
   574  	//log.Println(len(g.c))
   575  
   576  	g.createCityNamesLocationsIndex()
   577  }
   578  
   579  // Geocode forward geocode, location string to lat/lng (returns a struct though).
   580  // Calls exactMatchCity / fuzzyMatchLocation to perform a search.
   581  func (g *GeoBed) Geocode(n string, opts ...GeocodeOptions) GeobedCity {
   582  	var c GeobedCity
   583  	n = strings.TrimSpace(n)
   584  	if n == "" {
   585  		return c
   586  	}
   587  	// variadic optional argument trick
   588  	options := GeocodeOptions{}
   589  	if len(opts) > 0 {
   590  		options = opts[0]
   591  	}
   592  
   593  	if options.ExactCity {
   594  		c = g.exactMatchCity(n)
   595  	} else {
   596  		// NOTE: The downside of this (currently) is that something is basically
   597  		// always returned. It's a best guess.
   598  		// There's not much chance of it returning "not found" (or an empty
   599  		// GeobedCity struct).
   600  		// If you'd rather have nothing returned if not found, look at
   601  		// more exact matching options.
   602  		c = g.fuzzyMatchLocation(n)
   603  	}
   604  
   605  	return c
   606  }
   607  
   608  func filterMatchingCities(nCo, nSt string, matchingCities []GeobedCity) GeobedCity {
   609  	var c GeobedCity
   610  	// Then range over those matching cities and try to figure out which
   611  	// one it is - city names are unfortunately not unique of course.
   612  	// There shouldn't be very many so I don't mind the multiple loops.
   613  	for _, city := range matchingCities {
   614  		// Was the state abbreviation present? That sounds promising.
   615  		if strings.EqualFold(nSt, city.Region) {
   616  			c = city
   617  		}
   618  	}
   619  
   620  	for _, city := range matchingCities {
   621  		// Matches the state and country? Likely the best scenario,
   622  		// I'd call it the best match.
   623  		if strings.EqualFold(nSt, city.Region) && strings.EqualFold(nCo, city.Country) {
   624  			c = city
   625  		}
   626  	}
   627  
   628  	// If we still don't have a city, maybe we have a country with the
   629  	// city name, ie. "New York, USA"
   630  	// This is tougher because there's a "New York" in Florida, Kentucky,
   631  	// and more. Let's use population to assist if we can.
   632  	if c.City == "" {
   633  		matchingCountryCities := []GeobedCity{}
   634  		for _, city := range matchingCities {
   635  			if strings.EqualFold(nCo, city.Country) {
   636  				matchingCountryCities = append(matchingCountryCities, city)
   637  			}
   638  		}
   639  
   640  		// If someone says, "New York, USA" they most likely mean
   641  		// New York, NY because it's the largest city.
   642  		// Specific locations are often implied based on size or
   643  		// popularity even though the names aren't unique.
   644  		biggestCity := GeobedCity{}
   645  		for _, city := range matchingCountryCities {
   646  			if city.Population > biggestCity.Population {
   647  				biggestCity = city
   648  			}
   649  		}
   650  		c = biggestCity
   651  	}
   652  	return c
   653  }
   654  
   655  // Returns a GeobedCity only if there is an exact city name match. A stricter
   656  // match, though if state or country are missing a guess will be made.
   657  func (g *GeoBed) exactMatchCity(n string) GeobedCity {
   658  	var c GeobedCity
   659  	// Ignore the `abbrevSlice` value for now. Use `nCo` and `nSt` for more accuracy.
   660  	nCo, nSt, _, nSlice := g.extractLocationPieces(n)
   661  	nWithoutAbbrev := strings.Join(nSlice, " ")
   662  	ranges := g.getSearchRange(nSlice)
   663  
   664  	matchingCities := []GeobedCity{}
   665  
   666  	// First, get everything that matches the city exactly (case insensitive).
   667  	for _, rng := range ranges {
   668  		// When adjusting the range, the keys become out of sync. Start from rng.f
   669  		currentKey := rng.f
   670  		for _, v := range g.c[rng.f:rng.t] {
   671  			currentKey++
   672  			// The full string (ie. "New York" or "Las Vegas")
   673  			if strings.EqualFold(n, v.City) {
   674  				matchingCities = append(matchingCities, v)
   675  			}
   676  			// The pieces with abbreviations removed
   677  			if strings.EqualFold(nWithoutAbbrev, v.City) {
   678  				matchingCities = append(matchingCities, v)
   679  			}
   680  			// Each piece - doesn't make sense for now. May revisit this.
   681  			// ie. "New York" or "New" and "York" ... well, "York" is going
   682  			// to match a different city.
   683  			// While that might be weeded out next, who knows. It's starting
   684  			// to get more fuzzy than I'd like for this function.
   685  			// for _, np := range nSlice {
   686  			// 	if strings.EqualFold(np, v.City) {
   687  			// 		matchingCities = append(matchingCities, v)
   688  			// 	}
   689  			// }
   690  		}
   691  	}
   692  
   693  	// If only one was found, we can stop right here.
   694  	if len(matchingCities) == 1 {
   695  		return matchingCities[0]
   696  		// If more than one was found, we need to guess.
   697  	} else if len(matchingCities) > 1 {
   698  		c = filterMatchingCities(nCo, nSt, matchingCities)
   699  	}
   700  
   701  	return c
   702  }
   703  
   704  func scoreCountryMatch(v GeobedCity, currentKey int, bestMatchingKeys map[int]int, nCo string) {
   705  	// A discovered country name converted into a country code
   706  	if nCo != "" {
   707  		if nCo == v.Country {
   708  			if val, ok := bestMatchingKeys[currentKey]; ok {
   709  				bestMatchingKeys[currentKey] = val + 4
   710  			} else {
   711  				bestMatchingKeys[currentKey] = 4
   712  			}
   713  		}
   714  	}
   715  }
   716  
   717  func scoreStateMatch(v GeobedCity, currentKey int, bestMatchingKeys map[int]int, nSt string) {
   718  	// A discovered state name converted into a region code
   719  	if nSt != "" {
   720  		if nSt == v.Region {
   721  			if val, ok := bestMatchingKeys[currentKey]; ok {
   722  				bestMatchingKeys[currentKey] = val + 4
   723  			} else {
   724  				bestMatchingKeys[currentKey] = 4
   725  			}
   726  		}
   727  	}
   728  }
   729  
   730  func scoreAlternativeNames(v GeobedCity, currentKey int, bestMatchingKeys map[int]int, query string) {
   731  	// If any alternate names can be discovered, take them into consideration.
   732  	if v.CityAlt != "" {
   733  		alts := strings.Fields(v.CityAlt)
   734  		for _, altV := range alts {
   735  			if strings.EqualFold(altV, query) {
   736  				if val, ok := bestMatchingKeys[currentKey]; ok {
   737  					bestMatchingKeys[currentKey] = val + 3
   738  				} else {
   739  					bestMatchingKeys[currentKey] = 3
   740  				}
   741  			}
   742  			// Exact, a case-sensitive match means a lot.
   743  			if altV == query {
   744  				if val, ok := bestMatchingKeys[currentKey]; ok {
   745  					bestMatchingKeys[currentKey] = val + 5
   746  				} else {
   747  					bestMatchingKeys[currentKey] = 5
   748  				}
   749  			}
   750  		}
   751  	}
   752  }
   753  
   754  func scoreFuzzyMatches(v GeobedCity, currentKey int, bestMatchingKeys map[int]int, query, nCo, nSt string, abbrevSlice, nSlice []string) {
   755  
   756  	// Special case. Airport codes and other short 3 letter abbreviations,
   757  	// ie. NYC and SFO
   758  	// Country codes could present problems here. It seems to work for NYC,
   759  	// but not SFO (which there are multiple SFOs actually).
   760  	// Leaving it for now, but airport codes are tricky (though they are
   761  	// popular on Twitter). These must be exact (case sensitive) matches.
   762  	// if len(n) == 3 {
   763  	// 	alts := strings.Split(v.CityAlt, ",")
   764  	// 	for _, altV := range alts {
   765  	// 		if altV != "" {
   766  	// 			if altV == n {
   767  	// 				if val, ok := bestMatchingKeys[currentKey]; ok {
   768  	// 					bestMatchingKeys[currentKey] = val + 4
   769  	// 				} else {
   770  	// 					bestMatchingKeys[currentKey] = 4
   771  	// 				}
   772  	// 			}
   773  	// 		}
   774  	// 	}
   775  	// }
   776  
   777  	// Abbreviations for state/country
   778  	// Region (state/province)
   779  	for _, av := range abbrevSlice {
   780  		lowerAv := toLower(av)
   781  		if len(av) == 2 && strings.EqualFold(v.Region, lowerAv) {
   782  			if val, ok := bestMatchingKeys[currentKey]; ok {
   783  				bestMatchingKeys[currentKey] = val + 5
   784  			} else {
   785  				bestMatchingKeys[currentKey] = 5
   786  			}
   787  		}
   788  
   789  		// Country (worth 2 points if exact match)
   790  		if len(av) == 2 && strings.EqualFold(v.Country, lowerAv) {
   791  			if val, ok := bestMatchingKeys[currentKey]; ok {
   792  				bestMatchingKeys[currentKey] = val + 3
   793  			} else {
   794  				bestMatchingKeys[currentKey] = 3
   795  			}
   796  		}
   797  	}
   798  
   799  	scoreCountryMatch(v, currentKey, bestMatchingKeys, nCo)
   800  	scoreStateMatch(v, currentKey, bestMatchingKeys, nSt)
   801  	scoreAlternativeNames(v, currentKey, bestMatchingKeys, query)
   802  
   803  	// Exact city name matches mean a lot.
   804  	if strings.EqualFold(query, v.City) {
   805  		if val, ok := bestMatchingKeys[currentKey]; ok {
   806  			bestMatchingKeys[currentKey] = val + 7
   807  		} else {
   808  			bestMatchingKeys[currentKey] = 7
   809  		}
   810  	}
   811  
   812  }
   813  
   814  func (g *GeoBed) getBestFuzzyMatches(ranges []r, query, nCo, nSt string, abbrevSlice, nSlice []string) (map[int]int, int) {
   815  
   816  	var bestMatchingKeys = map[int]int{}
   817  	var bestMatchingKey = 0
   818  
   819  	for _, rng := range ranges {
   820  		// When adjusting the range, the keys become out of sync. Start from rng.f
   821  		currentKey := rng.f
   822  
   823  		for _, v := range g.c[rng.f:rng.t] {
   824  			currentKey++
   825  
   826  			// Mainly useful for strings like: "Austin, TX" or "Austin TX"
   827  			// (locations with US state codes). Smile if your location string is this simple.
   828  			if nSt != "" {
   829  				if strings.EqualFold(query, v.City) && strings.EqualFold(nSt, v.Region) {
   830  					bestMatchingKeys[0] = currentKey
   831  					bestMatchingKey = currentKey
   832  					return bestMatchingKeys, bestMatchingKey
   833  				}
   834  			}
   835  
   836  			scoreFuzzyMatches(v, currentKey, bestMatchingKeys, query, nCo, nSt, abbrevSlice, nSlice)
   837  
   838  			for _, ns := range nSlice {
   839  				ns = strings.TrimSuffix(ns, ",")
   840  
   841  				// City (worth 2 points if contains part of string)
   842  				if strings.Contains(toLower(v.City), toLower(ns)) {
   843  					if val, ok := bestMatchingKeys[currentKey]; ok {
   844  						bestMatchingKeys[currentKey] = val + 2
   845  					} else {
   846  						bestMatchingKeys[currentKey] = 2
   847  					}
   848  				}
   849  
   850  				// If there's an exact match, maybe there was noise in the string
   851  				// so it could be the full city name, but unlikely. For
   852  				// example, "New" or "Los" is in many city names.
   853  				// Still, give it a point because it could be the bulkier part
   854  				// of a city name (or the city name could be one word).
   855  				// This has helped in some cases.
   856  				if strings.EqualFold(v.City, ns) {
   857  					if val, ok := bestMatchingKeys[currentKey]; ok {
   858  						bestMatchingKeys[currentKey] = val + 1
   859  					} else {
   860  						bestMatchingKeys[currentKey] = 1
   861  					}
   862  				}
   863  
   864  			}
   865  		}
   866  	}
   867  
   868  	return bestMatchingKeys, bestMatchingKey
   869  }
   870  
   871  // When geocoding, this provides a scored best match.
   872  func (g *GeoBed) fuzzyMatchLocation(n string) GeobedCity {
   873  	nCo, nSt, abbrevSlice, nSlice := g.extractLocationPieces(n)
   874  	// Take the renaming unclassified pieces (those not likely to be
   875  	// abbreviations) and get our search range.
   876  	// These pieces are likely contain the city name. Narrowing down
   877  	// the search range will make the lookup faster.
   878  	ranges := g.getSearchRange(nSlice)
   879  
   880  	bestMatchingKeys, bestMatchingKey := g.getBestFuzzyMatches(ranges, n, nCo, nSt, abbrevSlice, nSlice)
   881  	if len(bestMatchingKeys) == 1 {
   882  		return g.c[bestMatchingKey]
   883  	}
   884  
   885  	// If no country was found, look at population as a factor. Is it obvious?
   886  	if nCo == "" {
   887  		hp := int32(0)
   888  		hpk := 0
   889  		for k, v := range bestMatchingKeys {
   890  			// Add bonus point for having a population 1,000+
   891  			if g.c[k].Population >= 1000 {
   892  				bestMatchingKeys[k] = v + 1
   893  			}
   894  			// Now just add a bonus for having the highest population and points
   895  			if g.c[k].Population > hp {
   896  				hpk = k
   897  				hp = g.c[k].Population
   898  			}
   899  		}
   900  		// Add a point for having the highest population (if any of the results
   901  		// had population data available).
   902  		if g.c[hpk].Population > 0 {
   903  			bestMatchingKeys[hpk] = bestMatchingKeys[hpk] + 1
   904  		}
   905  	}
   906  
   907  	m := 0
   908  	for k, v := range bestMatchingKeys {
   909  		if v > m {
   910  			m = v
   911  			bestMatchingKey = k
   912  		}
   913  
   914  		// If there is a tie breaker, use the city with the higher population
   915  		// (if known) because it's more likely to be what is meant.
   916  		// For example, when people say "New York" they typically mean
   917  		// New York, NY...Though there are many New Yorks.
   918  		if v == m {
   919  			if g.c[k].Population > g.c[bestMatchingKey].Population {
   920  				bestMatchingKey = k
   921  			}
   922  		}
   923  	}
   924  
   925  	// debug
   926  	// log.Println("Possible results:")
   927  	// log.Println(len(bestMatchingKeys))
   928  	// for _, kv := range bestMatchingKeys {
   929  	// 	log.Println(g.c[kv])
   930  	// }
   931  	// log.Println("Best match:")
   932  	// log.Println(g.c[bestMatchingKey])
   933  	// log.Println("Scored:")
   934  	// log.Println(m)
   935  
   936  	return g.c[bestMatchingKey]
   937  }
   938  
   939  // Splits a string up looking for potential abbreviations by matching against
   940  // a shorter list of abbreviations.
   941  // Returns country, state, a slice of strings with potential abbreviations
   942  // (based on size; 2 or 3 characters), and then a slice of the remaning pieces.
   943  // This does a good job at separating things that are clearly abbreviations
   944  // from the city so that searching is faster and more accuarate.
   945  func (g *GeoBed) extractLocationPieces(n string) (string, string, []string, []string) {
   946  	var re = regexp.MustCompile("")
   947  
   948  	// Extract all potential abbreviations.
   949  	re = regexp.MustCompile(`[\S]{2,3}`)
   950  	abbrevSlice := re.FindStringSubmatch(n)
   951  
   952  	// Convert country to country code and pull it out. We'll use it as a
   953  	// secondary form of validation. Remove the code from the original query.
   954  	nCo := ""
   955  	for _, co := range g.co {
   956  		re = regexp.MustCompile("(?i)^" + co.Country + ",?\\s|\\s" + co.Country + ",?\\s" + co.Country + "\\s$")
   957  		if re.MatchString(n) {
   958  			nCo = co.ISO
   959  			// And remove it so we have a cleaner query string for a city.
   960  			n = re.ReplaceAllString(n, "")
   961  		}
   962  	}
   963  
   964  	// Find US State codes and pull them out as well (do not convert
   965  	// state names, they can also easily be city names).
   966  	nSt := ""
   967  	for sc := range UsSateCodes {
   968  		re = regexp.MustCompile("(?i)^" + sc + ",?\\s|\\s" + sc + ",?\\s|\\s" + sc + "$")
   969  		if re.MatchString(n) {
   970  			nSt = sc
   971  			// And remove it too.
   972  			n = re.ReplaceAllString(n, "")
   973  		}
   974  	}
   975  	// Trim spaces and commas off the modified string.
   976  	n = strings.Trim(n, " ,")
   977  
   978  	// Now extract words (potential city names) into a slice. With this,
   979  	//  the index will be referenced to pinpoint sections of
   980  	// the g.c []GeobedCity slice to scan.
   981  	// This results in a much faster lookup. This is over a simple
   982  	// binary search with strings.Search() etc. because the city name
   983  	// may not be the first word.
   984  	// This should not contain any known country code or US state codes.
   985  	nSlice := strings.Split(n, " ")
   986  
   987  	return nCo, nSt, abbrevSlice, nSlice
   988  }
   989  
   990  // There's potentially 2.7 million items to range though, let's see if we can
   991  // reduce that by taking slices of the slice in alphabetical order.
   992  func (g *GeoBed) getSearchRange(nSlice []string) []r {
   993  	// NOTE: A simple binary search was not helping here since
   994  	// we aren't looking for one specific thing.
   995  	// We have multiple elements, city, state, country.
   996  	// So we'd end up with multiple binary searches to piece together which
   997  	//  could be quite a few exponentially given the possible
   998  	// combinations...And so it was slower.
   999  
  1000  	ranges := []r{}
  1001  	for _, ns := range nSlice {
  1002  		ns = strings.TrimSuffix(ns, ",")
  1003  
  1004  		if len(ns) > 0 {
  1005  			// Get the first character in the string, this tells us where to stop.
  1006  			fc := toLower(string(ns[0]))
  1007  			// Get the previous index key (by getting the previous
  1008  			// character in the alphabet) to figure out where to start.
  1009  			pik := string(prev(rune(fc[0])))
  1010  
  1011  			// To/from key
  1012  			fk := 0
  1013  			tk := 0
  1014  			if val, ok := cityNameIdx[pik]; ok {
  1015  				fk = val
  1016  			}
  1017  			if val, ok := cityNameIdx[fc]; ok {
  1018  				tk = val
  1019  			}
  1020  			// Don't let the to key be out of range.
  1021  			if tk == 0 {
  1022  				tk = (len(g.c) - 1)
  1023  			}
  1024  			ranges = append(ranges, r{fk, tk})
  1025  		}
  1026  	}
  1027  
  1028  	return ranges
  1029  }
  1030  
  1031  func prev(r rune) rune {
  1032  	return r - 1
  1033  }
  1034  
  1035  // ReverseGeocode finds place name by latitude and longitude.
  1036  func (g *GeoBed) ReverseGeocode(lat float64, lng float64) GeobedCity {
  1037  	c := GeobedCity{}
  1038  
  1039  	gh := geohash.Encode(lat, lng)
  1040  	// This is produced with empty lat/lng values - don't look for anything.
  1041  	if gh == "7zzzzzzzzzzz" {
  1042  		return c
  1043  	}
  1044  
  1045  	// Note: All geohashes are going to be 12 characters long. Even if the
  1046  	// precision on the lat/lng isn't great. The geohash package will center things.
  1047  	// Obviously lat/lng like 37, -122 is a guess. That's no where near
  1048  	// the resolution of a city. Though we're going to allow guesses.
  1049  	mostMatched := 0
  1050  	matched := 0
  1051  	for k, v := range g.c {
  1052  		// check first two characters to reduce the number of loops
  1053  		if v.Geohash[0] == gh[0] && v.Geohash[1] == gh[1] {
  1054  			matched = 2
  1055  			for i := 2; i <= len(gh); i++ {
  1056  				//log.Println(gh[0:i])
  1057  				if v.Geohash[0:i] == gh[0:i] {
  1058  					matched++
  1059  				}
  1060  			}
  1061  			// tie breakers go to city with larger population (NOTE: There's
  1062  			//  still a chance that the next pass will uncover a better match)
  1063  			if matched == mostMatched && g.c[k].Population > c.Population {
  1064  				c = g.c[k]
  1065  				//log.Println("MATCHES")
  1066  				//log.Println(matched)
  1067  				//log.Println("CITY")
  1068  				//log.Println(c.City)
  1069  				//log.Println("POPULATION")
  1070  				//log.Println(c.Population)
  1071  			}
  1072  			if matched > mostMatched {
  1073  				c = g.c[k]
  1074  				mostMatched = matched
  1075  			}
  1076  		}
  1077  	}
  1078  
  1079  	return c
  1080  }
  1081  
  1082  // A slightly faster lowercase function.
  1083  func toLower(s string) string {
  1084  	b := make([]byte, len(s))
  1085  	for i := range b {
  1086  		c := s[i]
  1087  		if c >= 'A' && c <= 'Z' {
  1088  			c += 'a' - 'A'
  1089  		}
  1090  		b[i] = c
  1091  	}
  1092  	return string(b)
  1093  }
  1094  
  1095  // A slightly faster uppercase function.
  1096  func toUpper(s string) string {
  1097  	b := make([]byte, len(s))
  1098  	for i := range b {
  1099  		c := s[i]
  1100  		if c >= 'a' && c <= 'z' {
  1101  			c -= 'a' - 'A'
  1102  		}
  1103  		b[i] = c
  1104  	}
  1105  	return string(b)
  1106  }
  1107  
  1108  // Dumps the Geobed data to disk. This speeds up startup time on subsequent
  1109  // runs (or if calling NewGeobed() multiple times which should be avoided
  1110  // if possible).
  1111  // TODO: Refactor
  1112  func (g GeoBed) store() error {
  1113  	b := new(bytes.Buffer)
  1114  
  1115  	// Store the city info
  1116  	enc := gob.NewEncoder(b)
  1117  	err := enc.Encode(g.c)
  1118  	if err != nil {
  1119  		b.Reset()
  1120  		return err
  1121  	}
  1122  
  1123  	fh, eopen := os.OpenFile("./geobed-data/g.c.dmp", os.O_CREATE|os.O_WRONLY, 0666)
  1124  	defer fh.Close()
  1125  	if eopen != nil {
  1126  		b.Reset()
  1127  		return eopen
  1128  	}
  1129  	n, e := fh.Write(b.Bytes())
  1130  	if e != nil {
  1131  		b.Reset()
  1132  		return e
  1133  	}
  1134  	log.Printf("%d bytes successfully written to cache file\n", n)
  1135  
  1136  	// Store the country info as well (this is all now repetition - refactor)
  1137  	b.Reset()
  1138  	//enc = gob.NewEncoder(b)
  1139  	err = enc.Encode(g.co)
  1140  	if err != nil {
  1141  		b.Reset()
  1142  		return err
  1143  	}
  1144  
  1145  	fh, eopen = os.OpenFile("./geobed-data/g.co.dmp", os.O_CREATE|os.O_WRONLY, 0666)
  1146  	defer fh.Close()
  1147  	if eopen != nil {
  1148  		b.Reset()
  1149  		return eopen
  1150  	}
  1151  	n, e = fh.Write(b.Bytes())
  1152  	if e != nil {
  1153  		b.Reset()
  1154  		return e
  1155  	}
  1156  	log.Printf("%d bytes successfully written to cache file\n", n)
  1157  
  1158  	// Store the index info (again there's some repetition here)
  1159  	b.Reset()
  1160  	//enc = gob.NewEncoder(b)
  1161  	err = enc.Encode(cityNameIdx)
  1162  	if err != nil {
  1163  		b.Reset()
  1164  		return err
  1165  	}
  1166  
  1167  	fh, eopen = os.OpenFile("./geobed-data/cityNameIdx.dmp", os.O_CREATE|os.O_WRONLY, 0666)
  1168  	defer fh.Close()
  1169  	if eopen != nil {
  1170  		b.Reset()
  1171  		return eopen
  1172  	}
  1173  	n, e = fh.Write(b.Bytes())
  1174  	if e != nil {
  1175  		b.Reset()
  1176  		return e
  1177  	}
  1178  	log.Printf("%d bytes successfully written to cache file\n", n)
  1179  
  1180  	b.Reset()
  1181  	return nil
  1182  }
  1183  
  1184  // Loads a GeobedCity dump, which saves a bit of time.
  1185  func loadGeobedCityData() ([]GeobedCity, error) {
  1186  	fh, err := os.Open("./geobed-data/g.c.dmp")
  1187  	if err != nil {
  1188  		return nil, err
  1189  	}
  1190  	gc := []GeobedCity{}
  1191  	dec := gob.NewDecoder(fh)
  1192  	err = dec.Decode(&gc)
  1193  	if err != nil {
  1194  		return nil, err
  1195  	}
  1196  	return gc, nil
  1197  }
  1198  
  1199  func loadGeobedCountryData() ([]CountryInfo, error) {
  1200  	fh, err := os.Open("./geobed-data/g.co.dmp")
  1201  	if err != nil {
  1202  		return nil, err
  1203  	}
  1204  	co := []CountryInfo{}
  1205  	dec := gob.NewDecoder(fh)
  1206  	err = dec.Decode(&co)
  1207  	if err != nil {
  1208  		return nil, err
  1209  	}
  1210  	return co, nil
  1211  }
  1212  
  1213  func loadGeobedCityNameIdx() error {
  1214  	fh, err := os.Open("./geobed-data/cityNameIdx.dmp")
  1215  	if err != nil {
  1216  		return err
  1217  	}
  1218  	dec := gob.NewDecoder(fh)
  1219  	cityNameIdx = make(map[string]int)
  1220  	err = dec.Decode(&cityNameIdx)
  1221  	if err != nil {
  1222  		return err
  1223  	}
  1224  	return nil
  1225  }