Places: Improve state name normalization #1664

This commit is contained in:
Michael Mayer 2021-11-11 16:00:42 +01:00
parent 4fb00198d2
commit 6c02ee2512
15 changed files with 188 additions and 178 deletions

4
go.mod
View file

@ -21,7 +21,7 @@ require (
github.com/go-playground/validator/v10 v10.9.0 // indirect
github.com/golang/geo v0.0.0-20210211234256-740aa86cb551
github.com/golang/protobuf v1.5.2 // indirect
github.com/google/open-location-code/go v0.0.0-20211109014933-06433367679b
github.com/google/open-location-code/go v0.0.0-20211110234603-604ed00fe9d8
github.com/gorilla/websocket v1.4.2
github.com/gosimple/slug v1.11.2
github.com/h2non/filetype v1.1.1
@ -57,7 +57,7 @@ require (
go4.org v0.0.0-20201209231011-d4a079459e60 // indirect
golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa
golang.org/x/image v0.0.0-20210628002857-a66eb6448b8d // indirect
golang.org/x/net v0.0.0-20211108170745-6635138e15ea
golang.org/x/net v0.0.0-20211111083644-e5c967477495
golang.org/x/sys v0.0.0-20211109065445-02f5c0300f6e // indirect
golang.org/x/text v0.3.7 // indirect
gonum.org/v1/gonum v0.9.3

4
go.sum
View file

@ -153,6 +153,8 @@ github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/
github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
github.com/google/open-location-code/go v0.0.0-20211109014933-06433367679b h1:6rfkSqY/nWZGdgpfCLumEAh3Remb/v1eyrGnFt5dCIs=
github.com/google/open-location-code/go v0.0.0-20211109014933-06433367679b/go.mod h1:eJfRN6aj+kR/rnua/rw9jAgYhqoMHldQkdTi+sePRKk=
github.com/google/open-location-code/go v0.0.0-20211110234603-604ed00fe9d8 h1:+C1yt4bGEM1u3akLWEDqtNhRV28xyCrPscLEgE3NGYc=
github.com/google/open-location-code/go v0.0.0-20211110234603-604ed00fe9d8/go.mod h1:eJfRN6aj+kR/rnua/rw9jAgYhqoMHldQkdTi+sePRKk=
github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
@ -377,6 +379,8 @@ golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81R
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20211108170745-6635138e15ea h1:FosBMXtOc8Tp9Hbo4ltl1WJSrTVewZU8MPnTPY2HdH8=
golang.org/x/net v0.0.0-20211108170745-6635138e15ea/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20211111083644-e5c967477495 h1:cjxxlQm6d4kYbhpZ2ghvmI8xnq0AG+jXmzrhzfkyu5A=
golang.org/x/net v0.0.0-20211111083644-e5c967477495/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=

View file

@ -135,10 +135,6 @@ func (l Location) Label() (result string) {
return l.Place.LocLabel
}
func (l Location) State() (result string) {
return txt.NormalizeState(l.Place.LocState)
}
func (l Location) City() (result string) {
return l.Place.LocCity
}
@ -147,6 +143,10 @@ func (l Location) CountryCode() (result string) {
return l.Place.LocCountry
}
func (l Location) State() (result string) {
return txt.NormalizeState(l.Place.LocState, l.CountryCode())
}
func (l Location) Latitude() (result float64) {
return l.LocLat
}

View file

@ -41,8 +41,8 @@ func NewLocation(id, name, category, label, city, state, country, source string,
LocCategory: category,
LocLabel: label,
LocCity: city,
LocState: txt.NormalizeState(state),
LocCountry: country,
LocState: txt.NormalizeState(state, country),
LocSource: source,
LocKeywords: keywords,
}
@ -129,7 +129,7 @@ func (l *Location) label() string {
loc = append(loc, l.LocCity)
}
if shortCountry && l.LocState != "" && l.LocCity != l.LocState {
if shortCountry && l.LocState != "" && !strings.EqualFold(l.LocState, l.LocCity) && !strings.EqualFold(l.LocState, l.LocCountry) {
loc = append(loc, l.LocState)
}
@ -164,14 +164,14 @@ func (l Location) City() string {
return l.LocCity
}
func (l Location) State() string {
return txt.NormalizeState(l.LocState)
}
func (l Location) CountryCode() string {
return l.LocCountry
}
func (l Location) State() string {
return txt.NormalizeState(l.LocState, l.CountryCode())
}
func (l Location) CountryName() string {
return CountryNames[l.LocCountry]
}

View file

@ -235,6 +235,26 @@ func TestLocation_place(t *testing.T) {
assert.Equal(t, "Unknown", l.label())
})
t.Run("Freiburg im Breisgau, BW, Germany", func(t *testing.T) {
l := NewLocation("47911b1a4f84", "", "", "Freiburg im Breisgau, BW, Germany", "Freiburg im Breisgau", "BW", "de", "", []string{})
assert.Equal(t, "Freiburg im Breisgau, Baden-Württemberg, Germany", l.label())
})
t.Run("Sevilla, ES, Spain", func(t *testing.T) {
l := NewLocation("0d126c12219c", "", "", "Sevilla, ES, Spain", "Sevilla", "ES", "es", "", []string{})
assert.Equal(t, "Sevilla, Spain", l.label())
})
t.Run("Guarapari, ES, Brazil", func(t *testing.T) {
l := NewLocation("00b85797fdbc", "", "", "Guarapari, ES, Brazil", "Guarapari", "ES", "br", "", []string{})
assert.Equal(t, "Guarapari, Espírito Santo, Brazil", l.label())
})
t.Run("Porto Novo, PT, Portugal", func(t *testing.T) {
l := NewLocation("0d1f30bb5564", "", "", "", "Porto Novo", "PT", "pt", "", []string{})
assert.Equal(t, "Porto Novo, Portugal", l.label())
})
}
func TestLocation_S2Token(t *testing.T) {

View file

@ -86,7 +86,7 @@ func (l Location) CellID() (result string) {
}
func (l Location) State() (result string) {
return txt.NormalizeState(l.Address.State)
return txt.NormalizeState(l.Address.State, l.CountryCode())
}
func (l Location) City() (result string) {

View file

@ -79,7 +79,7 @@ func (m Moment) Slug() string {
// Title returns an English title for the moment.
func (m Moment) Title() string {
state := txt.NormalizeState(m.State)
state := txt.NormalizeState(m.State, m.Country)
if m.Year == 0 && m.Month == 0 {
if m.Label != "" {

View file

@ -1,66 +0,0 @@
//go:build ignore
// +build ignore
// This generates states.go by running "go generate"
package main
import (
"bufio"
"os"
"strings"
"text/template"
"github.com/photoprism/photoprism/pkg/txt"
)
// State is one entry of the generated lookup table: an abbreviation and the
// full name it expands to.
type State struct {
Code string // abbreviation, upper-cased by main before it is stored
Name string // full name, passed through txt.Title by main
}
// states accumulates the entries parsed by main, in input order, and is
// handed to packageTemplate for rendering.
var states []State
// main reads "code:name" lines from ./resources/states.txt, collects them in
// states, and renders them into states.go using packageTemplate.
//
// Any I/O or template error aborts the generator with a panic so that a
// broken run never goes unnoticed.
func main() {
	file, err := os.Open("./resources/states.txt")
	if err != nil {
		panic(err)
	}
	// Register the close only once the open is known to have succeeded.
	defer file.Close()

	scanner := bufio.NewScanner(file)
	scanner.Split(bufio.ScanLines)

	for scanner.Scan() {
		parts := strings.Split(scanner.Text(), ":")

		// Skip lines that don't contain a "code:name" separator.
		if len(parts) < 2 {
			continue
		}

		states = append(states, State{Code: strings.ToUpper(parts[0]), Name: txt.Title(parts[1])})
	}

	// Scan also returns false on read errors; check before creating the
	// output so a failed read neither truncates states.go nor produces a
	// partial table.
	if err := scanner.Err(); err != nil {
		panic(err)
	}

	f, err := os.Create("states.go")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Execute may fail mid-write; surface the error instead of leaving a
	// silently incomplete file behind.
	if err := packageTemplate.Execute(f, struct {
		States []State
	}{
		States: states,
	}); err != nil {
		panic(err)
	}
}
// packageTemplate renders the generated source: a "States" map in package
// txt with one quoted code/name pair per element of the .States slice it is
// executed with. template.Must panics at start-up if the template text does
// not parse.
var packageTemplate = template.Must(template.New("").Parse(`// Code generated by go generate; DO NOT EDIT.
package txt
var States = map[string]string{
{{- range .States }}
{{ printf "%q" .Code }}: {{ printf "%q" .Name }},
{{- end }}
}`))

7
pkg/txt/lookuptable.go Normal file
View file

@ -0,0 +1,7 @@
package txt
// LookupTable represents a string lookup table that maps a key, such as a
// state abbreviation, to the string it should be replaced with.
type LookupTable map[string]string
// LookupTableMap represents a map of string lookup tables, so that a table
// can be selected by a string key such as a country code.
type LookupTableMap map[string]LookupTable

View file

@ -28,18 +28,31 @@ func NormalizeName(name string) string {
}
// NormalizeState returns the full, normalized state name.
func NormalizeState(s string) string {
s = strings.TrimSpace(s)
func NormalizeState(stateName, countryCode string) string {
// Remove whitespace from name.
stateName = strings.TrimSpace(stateName)
if s == "" || s == UnknownStateCode {
// Empty?
if stateName == "" || stateName == UnknownStateCode {
// State doesn't have a name.
return ""
}
if expanded, ok := States[s]; ok {
return expanded
// Normalize country code.
countryCode = strings.ToLower(strings.TrimSpace(countryCode))
// Is the name an abbreviation that should be normalized?
if states, found := StatesByCountry[countryCode]; !found {
// Unknown country.
} else if normalized, found := states[stateName]; !found {
// Unknown abbreviation.
} else if normalized != "" {
// Yes, use normalized name.
stateName = normalized
}
return s
// Return normalized state name.
return stateName
}
// NormalizeQuery replaces search operator with default symbols.

View file

@ -32,42 +32,57 @@ func TestNormalizeName(t *testing.T) {
func TestNormalizeState(t *testing.T) {
t.Run("Berlin", func(t *testing.T) {
result := NormalizeState("Berlin")
result := NormalizeState("Berlin", "de")
assert.Equal(t, "Berlin", result)
})
t.Run("WA", func(t *testing.T) {
result := NormalizeState("WA")
result := NormalizeState("WA", "us")
assert.Equal(t, "Washington", result)
})
t.Run("QCUnknownCountry", func(t *testing.T) {
result := NormalizeState("QC", "")
assert.Equal(t, "QC", result)
})
t.Run("QCCanada", func(t *testing.T) {
result := NormalizeState("QC", "ca")
assert.Equal(t, "Quebec", result)
})
t.Run("QCUnitedStates", func(t *testing.T) {
result := NormalizeState("QC", "us")
assert.Equal(t, "QC", result)
})
t.Run("Wa", func(t *testing.T) {
result := NormalizeState("Wa")
result := NormalizeState("Wa", "us")
assert.Equal(t, "Wa", result)
})
t.Run("Washington", func(t *testing.T) {
result := NormalizeState("Washington")
result := NormalizeState("Washington", "us")
assert.Equal(t, "Washington", result)
})
t.Run("Never mind Nirvana", func(t *testing.T) {
result := NormalizeState("Never mind Nirvana.")
result := NormalizeState("Never mind Nirvana.", "us")
assert.Equal(t, "Never mind Nirvana.", result)
})
t.Run("Empty", func(t *testing.T) {
result := NormalizeState("")
result := NormalizeState("", "us")
assert.Equal(t, "", result)
})
t.Run("Unknown", func(t *testing.T) {
result := NormalizeState("zz")
result := NormalizeState("zz", "us")
assert.Equal(t, "", result)
})
t.Run("Space", func(t *testing.T) {
result := NormalizeState(" ")
result := NormalizeState(" ", "us")
assert.Equal(t, "", result)
})

View file

@ -1,72 +0,0 @@
AB:Alberta
AL:Alabama
AK:Alaska
AS:American Samoa
AZ:Arizona
AR:Arkansas
BC:British Columbia
CA:California
CO:Colorado
CT:Connecticut
DE:Delaware
DC:District of Columbia
FM:Federated States of Micronesia
FL:Florida
GA:Georgia
GU:Guam
HI:Hawaii
ID:Idaho
IL:Illinois
IN:Indiana
IA:Iowa
KS:Kansas
KY:Kentucky
LA:Louisiana
MB:Manitoba
ME:Maine
MH:Marshall Islands
MD:Maryland
MA:Massachusetts
MI:Michigan
MN:Minnesota
MS:Mississippi
MO:Missouri
MT:Montana
NE:Nebraska
NL:Newfoundland and Labrador
NU:Nunavut
NV:Nevada
NB:New Brunswick
NH:New Hampshire
NJ:New Jersey
NM:New Mexico
NY:New York
NC:North Carolina
ND:North Dakota
NT:Northwest Territories
NS:Nova Scotia
MP:Northern Mariana Islands
OH:Ohio
OK:Oklahoma
ON:Ontario
OR:Oregon
PE:Prince Edward Island
PW:Palau
PA:Pennsylvania
PR:Puerto Rico
QC:Quebec
RI:Rhode Island
SK:Saskatchewan
SC:South Carolina
SD:South Dakota
TN:Tennessee
TX:Texas
UT:Utah
VT:Vermont
VI:Virgin Islands
VA:Virginia
WA:Washington
WV:West Virginia
WI:Wisconsin
WY:Wyoming
YT:Yukon

View file

@ -1,14 +1,88 @@
// Code generated by go generate; DO NOT EDIT.
package txt
var States = map[string]string{
// StatesByCountry maps lower-case country codes to the lookup table of state
// abbreviations used in that country. The same abbreviation can therefore
// expand differently, or not at all, depending on the country.
var StatesByCountry = LookupTableMap{
"br": StatesBR,
"ca": StatesCA,
"de": StatesDE,
"us": StatesUS,
}
// StatesBR maps common two-letter abbreviations for Brazilian states to their
// full names. Entries are ordered by abbreviation.
var StatesBR = LookupTable{
"AC": "Acre",
"AL": "Alagoas",
"AM": "Amazonas",
"AP": "Amapá",
"BA": "Bahia",
"CE": "Ceará",
"DF": "Distrito Federal",
"ES": "Espírito Santo",
"GO": "Goiás",
"MA": "Maranhão",
"MG": "Minas Gerais",
"MS": "Mato Grosso do Sul",
"MT": "Mato Grosso",
"PA": "Pará",
"PB": "Paraíba",
"PE": "Pernambuco",
"PI": "Piauí",
"PR": "Paraná",
"RJ": "Rio de Janeiro",
"RN": "Rio Grande do Norte",
"RO": "Rondônia",
"RR": "Roraima",
"RS": "Rio Grande do Sul",
"SC": "Santa Catarina",
"SE": "Sergipe",
"SP": "São Paulo",
"TO": "Tocantins",
}
// StatesCA expands the two-letter abbreviations of Canadian provinces and
// territories to their full names. Entries are ordered by abbreviation.
var StatesCA = LookupTable{
	"AB": "Alberta",
	"BC": "British Columbia",
	"MB": "Manitoba",
	"NB": "New Brunswick",
	"NL": "Newfoundland and Labrador",
	"NS": "Nova Scotia",
	"NT": "Northwest Territories",
	"NU": "Nunavut",
	"ON": "Ontario",
	"PE": "Prince Edward Island",
	"QC": "Quebec",
	"SK": "Saskatchewan",
	"YT": "Yukon",
}
// StatesDE expands the two-letter abbreviations of German federal states to
// their German names. Entries are ordered by abbreviation.
var StatesDE = LookupTable{
	"BB": "Brandenburg",
	"BE": "Berlin",
	"BW": "Baden-Württemberg",
	"BY": "Bayern",
	"HB": "Bremen",
	"HE": "Hessen",
	"HH": "Hamburg",
	"MV": "Mecklenburg-Vorpommern",
	"NI": "Niedersachsen",
	"NW": "Nordrhein-Westfalen",
	"RP": "Rheinland-Pfalz",
	"SH": "Schleswig-Holstein",
	"SL": "Saarland",
	"SN": "Sachsen",
	"ST": "Sachsen-Anhalt",
	"TH": "Thüringen",
}
// StatesUS maps common abbreviations for US states.
var StatesUS = LookupTable{
"AL": "Alabama",
"AK": "Alaska",
"AS": "American Samoa",
"AZ": "Arizona",
"AR": "Arkansas",
"BC": "British Columbia",
"CA": "California",
"CO": "Colorado",
"CT": "Connecticut",
@ -37,28 +111,21 @@ var States = map[string]string{
"MO": "Missouri",
"MT": "Montana",
"NE": "Nebraska",
"NL": "Newfoundland and Labrador",
"NU": "Nunavut",
"NV": "Nevada",
"NB": "New Brunswick",
"NH": "New Hampshire",
"NJ": "New Jersey",
"NM": "New Mexico",
"NY": "New York",
"NC": "North Carolina",
"ND": "North Dakota",
"NT": "Northwest Territories",
"NS": "Nova Scotia",
"MP": "Northern Mariana Islands",
"OH": "Ohio",
"OK": "Oklahoma",
"ON": "Ontario",
"OR": "Oregon",
"PE": "Prince Edward Island",
"PW": "Palau",
"PA": "Pennsylvania",
"PR": "Puerto Rico",
"QC": "Quebec",
"RI": "Rhode Island",
"SK": "Saskatchewan",
"SC": "South Carolina",
@ -73,5 +140,4 @@ var States = map[string]string{
"WV": "West Virginia",
"WI": "Wisconsin",
"WY": "Wyoming",
"YT": "Yukon",
}

24
pkg/txt/states_test.go Normal file
View file

@ -0,0 +1,24 @@
package txt
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestStatesByCountry verifies that an abbreviation is only resolved through
// the table of the country it belongs to: "QC" expands for Canada but yields
// the empty string for an unknown country and for the United States.
func TestStatesByCountry(t *testing.T) {
	cases := []struct {
		name    string
		country string
		code    string
		want    string
	}{
		{name: "QCUnknownCountry", country: "", code: "QC", want: ""},
		{name: "QCCanada", country: "ca", code: "QC", want: "Quebec"},
		{name: "QCUnitedStates", country: "us", code: "QC", want: ""},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Indexing a missing country yields a nil table, and indexing a
			// nil table yields "", so no existence checks are required.
			got := StatesByCountry[tc.country][tc.code]
			assert.Equal(t, tc.want, got)
		})
	}
}

View file

@ -32,6 +32,5 @@ https://docs.photoprism.org/developer-guide/
package txt
//go:generate go run gen_countries.go
//go:generate go run gen_states.go
//go:generate go run gen_stopwords.go
//go:generate go fmt .