feat: improve forward geocoding with fallback street extraction
Adds extractStreetAddress() to strip institution name prefixes
("Canossa School (H.K.) 8 Hoi Chak Street" → "8 Hoi Chak Street").
Also cleans Kln./R.E./Lantau Island suffixes. Falls back to the
street-only query if the full address returns no result, marking
results with [FOUND (fallback)] in output.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -71,17 +71,34 @@ function sleep(ms: number): Promise<void> {
|
|||||||
|
|
||||||
/**
 * Normalises a raw address string before it is sent to the geocoder.
 *
 * Steps, in order:
 *  1. Remove the first "R.E." marker (with any comma/whitespace before it).
 *  2. Remove one trailing city/region suffix (H.K., HK, Hong Kong, Kowloon,
 *     Kln., New Territories, N.T., Lantau Island).
 *  3. Trim surrounding whitespace, then drop a single trailing period.
 *
 * "R.E." is removed *before* the region suffix because the suffix pattern is
 * anchored to the end of the string: with the opposite order an address such
 * as "..., Kln. R.E." would keep its "Kln" suffix.
 *
 * @param address - Raw address text.
 * @returns The cleaned address; may be an empty string if nothing is left.
 */
function cleanAddress(address: string): string {
  // NOTE(review): the original comment expands "R.E." as a "Religious
  // Education" suffix used in HK addresses; that meaning is not verifiable
  // from this file, so confirm before relying on it.
  const withoutMarker = address.replace(/,?\s*R\.E\./i, '');

  // Strip a trailing city/region suffix (only the last one is removed).
  const withoutRegion = withoutMarker.replace(
    /,?\s*(H\.K\.|HK|Hong Kong|Kowloon|Kln\.|New Territories|N\.T\.|Lantau Island)\.?\s*$/i,
    ''
  );

  // Trim first so that trailing whitespace cannot hide a final period.
  return withoutRegion.trim().replace(/\.$/, '').trim();
}
|
||||||
|
|
||||||
async function forwardGeocode(address: string, countryCode: string): Promise<NominatimSearchResult | null> {
|
/**
|
||||||
const nominatimCountry = NOMINATIM_COUNTRY_MAP[countryCode] ?? countryCode.toLowerCase();
|
* Fallback: strip any leading non-numeric institution name prefix and floor/unit designators,
|
||||||
const cleanedAddress = cleanAddress(address);
|
* returning just the street number onwards. Handles patterns like:
|
||||||
|
* "Canossa School (H.K.) 8 Hoi Chak Street" → "8 Hoi Chak Street"
|
||||||
|
* "G/F., Wai Ming Block, 111 Wing Hong Street" → "111 Wing Hong Street"
|
||||||
|
* "3/F., Chi Wo Commercial Building, 20 Saigon Street" → "20 Saigon Street"
|
||||||
|
*/
|
||||||
|
function extractStreetAddress(address: string): string | null {
  // Returns the "house number onwards" part of an address (run through
  // cleanAddress), or null when no house number is found or when the
  // extracted part is not meaningfully shorter than the input.
  //
  // Patterns are tried in order; the first one that matches wins.
  //  1. House number at the start of the string or right after a comma
  //     ("G/F., Wai Ming Block, 111 Wing Hong Street").
  //  2. Relaxed form: house number preceded by any whitespace or comma, with
  //     an optional hyphenated range. Needed for a name prefix that is NOT
  //     followed by a comma ("Canossa School (H.K.) 8 Hoi Chak Street") and
  //     for ranges ("Block A, 12-14 Foo Street"); neither matches pattern 1.
  // Both require whitespace or a comma right after the number, so floor
  // designators such as "3/F." are never mistaken for a house number.
  const houseNumberPatterns: readonly RegExp[] = [
    /(?:^|,\s*)(\d+[A-Za-z]?(?:\s|,).*)/,
    /(?:^|[\s,])(\d+[A-Za-z]?(?:-\d+[A-Za-z]?)?(?:\s|,).*)/,
  ];

  let candidate: string | undefined;
  for (const pattern of houseNumberPatterns) {
    candidate = pattern.exec(address)?.[1]?.trim();
    if (candidate) break;
  }
  if (!candidate) return null;

  // Must be meaningfully shorter than the full address to be worth retrying.
  return candidate.length < address.length * 0.9 ? cleanAddress(candidate) : null;
}
|
||||||
|
|
||||||
|
async function nominatimSearch(query: string, nominatimCountry: string): Promise<NominatimSearchResult | null> {
|
||||||
const response = await axios.get(NOMINATIM_SEARCH_URL, {
|
const response = await axios.get(NOMINATIM_SEARCH_URL, {
|
||||||
params: {
|
params: {
|
||||||
q: cleanedAddress,
|
q: query,
|
||||||
format: 'json',
|
format: 'json',
|
||||||
limit: 1,
|
limit: 1,
|
||||||
countrycodes: nominatimCountry,
|
countrycodes: nominatimCountry,
|
||||||
@@ -93,11 +110,31 @@ async function forwardGeocode(address: string, countryCode: string): Promise<Nom
|
|||||||
},
|
},
|
||||||
timeout: 15000,
|
timeout: 15000,
|
||||||
});
|
});
|
||||||
|
|
||||||
const results: NominatimSearchResult[] = response.data;
|
const results: NominatimSearchResult[] = response.data;
|
||||||
return results.length > 0 ? results[0] : null;
|
return results.length > 0 ? results[0] : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Geocodes an address, retrying once with a street-only query when the full
 * (cleaned) address yields nothing.
 *
 * @param address - Raw address text; it is cleaned with cleanAddress first.
 * @param countryCode - Country code, translated through NOMINATIM_COUNTRY_MAP
 *   and lower-cased as-is when the map has no entry for it.
 * @returns The matched result plus `usedFallback` (true when the hit came
 *   from the street-only retry), or null when neither query matched.
 */
async function forwardGeocode(
  address: string,
  countryCode: string
): Promise<{ result: NominatimSearchResult; usedFallback: boolean } | null> {
  const nominatimCountry = NOMINATIM_COUNTRY_MAP[countryCode] ?? countryCode.toLowerCase();
  const fullQuery = cleanAddress(address);

  // First attempt: the whole cleaned address.
  const fullHit = await nominatimSearch(fullQuery, nominatimCountry);
  if (fullHit) {
    return { result: fullHit, usedFallback: false };
  }

  // Second attempt: only the street-number-onwards portion. Skipped when no
  // such portion exists or when it would repeat the query that just failed.
  const streetQuery = extractStreetAddress(address);
  if (!streetQuery || streetQuery === fullQuery) {
    return null;
  }

  await sleep(RATE_LIMIT_MS); // respect rate limit between retries
  const streetHit = await nominatimSearch(streetQuery, nominatimCountry);
  return streetHit ? { result: streetHit, usedFallback: true } : null;
}
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
const args = process.argv.slice(2);
|
const args = process.argv.slice(2);
|
||||||
const dryRun = args.includes('--dry-run');
|
const dryRun = args.includes('--dry-run');
|
||||||
@@ -132,19 +169,20 @@ async function main() {
|
|||||||
|
|
||||||
for (const church of churches) {
|
for (const church of churches) {
|
||||||
try {
|
try {
|
||||||
const result = await forwardGeocode(church.address, church.country);
|
const geocoded = await forwardGeocode(church.address, church.country);
|
||||||
|
|
||||||
if (!result) {
|
if (!geocoded) {
|
||||||
log(` - [NOT FOUND] ${church.name} | ${church.address}`);
|
log(` - [NOT FOUND] ${church.name} | ${church.address}`);
|
||||||
stats.notFound++;
|
stats.notFound++;
|
||||||
} else {
|
} else {
|
||||||
|
const { result, usedFallback } = geocoded;
|
||||||
const lat = parseFloat(result.lat);
|
const lat = parseFloat(result.lat);
|
||||||
const lng = parseFloat(result.lon);
|
const lng = parseFloat(result.lon);
|
||||||
const city = result.address?.city || result.address?.town ||
|
const city = result.address?.city || result.address?.town ||
|
||||||
result.address?.village || result.address?.municipality || null;
|
result.address?.village || result.address?.municipality || null;
|
||||||
const state = result.address?.state || result.address?.province || null;
|
const state = result.address?.state || result.address?.province || null;
|
||||||
|
|
||||||
log(` + [FOUND] ${church.name}`);
|
log(` + [FOUND${usedFallback ? ' (fallback)' : ''}] ${church.name}`);
|
||||||
log(` ${church.address}`);
|
log(` ${church.address}`);
|
||||||
log(` → ${lat}, ${lng}${city ? ` (${city})` : ''}`);
|
log(` → ${lat}, ${lng}${city ? ` (${city})` : ''}`);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user