From 76cca3ba7582fee1e3de0ca2dc8278f703ce0246 Mon Sep 17 00:00:00 2001 From: albertfj114 Date: Fri, 3 Apr 2026 21:40:25 -0400 Subject: [PATCH] feat: improve forward geocoding with fallback street extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds extractStreetAddress() to strip institution name prefixes ("Canossa School (H.K.) 8 Hoi Chak Street" → "8 Hoi Chak Street"). Also cleans Kln./R.E./Lantau Island suffixes. Falls back to the street-only query if the full address returns no result, marking results with [FOUND (fallback)] in output. Co-Authored-By: Claude Sonnet 4.6 --- scripts/enrich-with-forward-geocode.ts | 56 +++++++++++++++++++++----- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/scripts/enrich-with-forward-geocode.ts b/scripts/enrich-with-forward-geocode.ts index bc26477..b4277c3 100644 --- a/scripts/enrich-with-forward-geocode.ts +++ b/scripts/enrich-with-forward-geocode.ts @@ -71,17 +71,34 @@ function sleep(ms: number): Promise { function cleanAddress(address: string): string { return address - .replace(/,?\s*(H\.K\.|HK|Hong Kong|Kowloon|New Territories|N\.T\.)\.?\s*$/i, '') + // Strip trailing city/region suffixes + .replace(/,?\s*(H\.K\.|HK|Hong Kong|Kowloon|Kln\.|New Territories|N\.T\.|Lantau Island)\.?\s*$/i, '') + // Strip "R.E." (Religious Education suffix used in HK addresses) + .replace(/,?\s*R\.E\./i, '') .replace(/\.$/, '') .trim(); } -async function forwardGeocode(address: string, countryCode: string): Promise { - const nominatimCountry = NOMINATIM_COUNTRY_MAP[countryCode] ?? countryCode.toLowerCase(); - const cleanedAddress = cleanAddress(address); +/** + * Fallback: strip any leading non-numeric institution name prefix and floor/unit designators, + * returning just the street number onwards. Handles patterns like: + * "Canossa School (H.K.) 8 Hoi Chak Street" → "8 Hoi Chak Street" + * "G/F., Wai Ming Block, 111 Wing Hong Street" → "111 Wing Hong Street" + * "3/F., Chi Wo Commercial Building, 20 Saigon Street" → "20 Saigon Street" + */ +function extractStreetAddress(address: string): string | null { + // Find the first occurrence of a standalone number (house number) + const match = address.match(/(?:^|,\s*)(\d+[A-Za-z]?(?:\s|,).*)/); + if (!match) return null; + const candidate = match[1].trim(); + // Must be meaningfully shorter than the full address to be worth retrying + return candidate.length < address.length * 0.9 ? cleanAddress(candidate) : null; +} + +async function nominatimSearch(query: string, nominatimCountry: string): Promise { const response = await axios.get(NOMINATIM_SEARCH_URL, { params: { - q: cleanedAddress, + q: query, format: 'json', limit: 1, countrycodes: nominatimCountry, @@ -93,11 +110,31 @@ async function forwardGeocode(address: string, countryCode: string): Promise 0 ? results[0] : null; } +async function forwardGeocode( + address: string, + countryCode: string +): Promise<{ result: NominatimSearchResult; usedFallback: boolean } | null> { + const nominatimCountry = NOMINATIM_COUNTRY_MAP[countryCode] ?? countryCode.toLowerCase(); + const cleaned = cleanAddress(address); + + const primary = await nominatimSearch(cleaned, nominatimCountry); + if (primary) return { result: primary, usedFallback: false }; + + // Fallback: try just the street-number-onwards portion + const streetOnly = extractStreetAddress(address); + if (streetOnly && streetOnly !== cleaned) { + await sleep(RATE_LIMIT_MS); // respect rate limit between retries + const fallback = await nominatimSearch(streetOnly, nominatimCountry); + if (fallback) return { result: fallback, usedFallback: true }; + } + + return null; +} + async function main() { const args = process.argv.slice(2); const dryRun = args.includes('--dry-run'); @@ -132,19 +169,20 @@ async function main() { for (const church of churches) { try { - const result = await forwardGeocode(church.address, church.country); + const geocoded = await forwardGeocode(church.address, church.country); - if (!result) { + if (!geocoded) { log(` - [NOT FOUND] ${church.name} | ${church.address}`); stats.notFound++; } else { + const { result, usedFallback } = geocoded; const lat = parseFloat(result.lat); const lng = parseFloat(result.lon); const city = result.address?.city || result.address?.town || result.address?.village || result.address?.municipality || null; const state = result.address?.state || result.address?.province || null; - log(` + [FOUND] ${church.name}`); + log(` + [FOUND${usedFallback ? ' (fallback)' : ''}] ${church.name}`); log(` ${church.address}`); log(` → ${lat}, ${lng}${city ? ` (${city})` : ''}`);