Files
ScraperControl/scripts/enrich-with-forward-geocode.ts
albertfj114 76cca3ba75 feat: improve forward geocoding with fallback street extraction
Adds extractStreetAddress() to strip institution name prefixes
("Canossa School (H.K.) 8 Hoi Chak Street" → "8 Hoi Chak Street").
Also cleans Kln./R.E./Lantau Island suffixes. Falls back to the
street-only query if the full address returns no result, marking
results with [FOUND (fallback)] in output.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-03 21:40:25 -04:00

227 lines
7.5 KiB
TypeScript

#!/usr/bin/env tsx
/**
* Enrich churches that have lat/lng=0 with real coordinates via Nominatim forward geocoding.
* After this runs, enrich-with-reverse-geocode fills city/state from the new coordinates.
*
* Usage:
* npx tsx scripts/enrich-with-forward-geocode.ts --country HK --dry-run
* npx tsx scripts/enrich-with-forward-geocode.ts --country HK
* npx tsx scripts/enrich-with-forward-geocode.ts --limit 10
*
* Rate limit: 1 request/second (Nominatim usage policy — mandatory).
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import axios from 'axios';
// Postgres connection string; falls back to a local development database when
// DATABASE_URL is unset or empty.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Shared pg connection pool. When the URL contains "neon", SSL is configured with
// certificate verification turned off (rejectUnauthorized: false); otherwise the
// ssl option is left undefined.
// NOTE(review): disabling certificate verification weakens TLS — confirm it is still required.
const pool = new Pool({
connectionString: dbUrl,
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma client that talks to the database through the pg pool via the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// Endpoint used for every forward-geocoding (search) request.
const NOMINATIM_SEARCH_URL = 'https://nominatim.openstreetmap.org/search';
// Pause, in milliseconds, inserted between consecutive Nominatim requests. Slightly
// above one second so the 1 request/second policy noted in the file header is respected.
const RATE_LIMIT_MS = 1100;
// Some regions use a different ISO code in OSM than in our DB.
// Keys are the country codes stored on church rows; values are passed to Nominatim's
// `countrycodes` parameter. Codes not listed here are simply lower-cased by the caller.
const NOMINATIM_COUNTRY_MAP: Record<string, string> = {
HK: 'cn', // Hong Kong is part of China in OSM
MO: 'cn', // Macau likewise
};
/**
 * Shape of the church rows this script works with: exactly the columns selected
 * by the `findMany` query in `main()`.
 */
interface ChurchRecord {
/** Primary key, used to target the update. */
id: string;
/** Church name; only used in log output. */
name: string;
/** Free-text address that is cleaned and sent to Nominatim (the query excludes null addresses). */
address: string;
/** Country code as stored in the DB; mapped to a Nominatim country code before searching. */
country: string;
/** Existing city, if any; only filled from the geocoder when currently empty. */
city: string | null;
/** Existing state/province, if any; only filled from the geocoder when currently empty. */
state: string | null;
}
/**
 * The subset of a Nominatim search hit that this script reads.
 */
interface NominatimSearchResult {
/** Latitude as a decimal string; parsed with parseFloat before being stored. */
lat: string;
/** Longitude as a decimal string; parsed with parseFloat before being stored. */
lon: string;
/** Human-readable label of the hit (not read by this script). */
display_name: string;
/**
 * Structured address components. Presumably present because requests are sent
 * with `addressdetails: 1` — treated as optional here regardless.
 */
address?: {
// Locality candidates, consulted in this order: city, town, village, municipality.
city?: string;
town?: string;
village?: string;
municipality?: string;
// Region candidates, consulted in this order: state, province.
state?: string;
province?: string;
};
}
/** Prints `msg` via console.log, prefixed with the current time as a bracketed ISO-8601 stamp. */
function log(msg: string): void {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
/** Returns a promise that resolves (with no value) after a timer of `ms` milliseconds fires. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Normalises a raw address into a query string for the geocoder.
 *
 * Steps, in order:
 *  1. remove the first "R.E." marker (with any preceding comma/whitespace);
 *  2. repeatedly strip trailing region suffixes (H.K., HK, Hong Kong, Kowloon,
 *     Kln., New Territories, N.T., Lantau Island) until none is left, so
 *     "..., Kowloon, Hong Kong" loses both parts;
 *  3. drop a single trailing period and surrounding whitespace.
 *
 * A suffix is only stripped when it is not glued to a preceding letter, so a
 * word that merely ends in "HK" is left intact.
 *
 * @param address Raw address text.
 * @returns The cleaned address; may be empty if the input consisted only of suffixes.
 */
function cleanAddress(address: string): string {
  const trailingRegion =
    /,?\s*(?<![A-Za-z])(H\.K\.|HK|Hong Kong|Kowloon|Kln\.|New Territories|N\.T\.|Lantau Island)\.?\s*$/i;

  // Strip the "R.E." marker first so a region suffix hidden behind it becomes trailing.
  let cleaned = address.replace(/,?\s*R\.E\./i, '').trim();

  // Addresses frequently stack suffixes ("Kln., H.K."); peel them off one at a time.
  let previous: string;
  do {
    previous = cleaned;
    cleaned = cleaned.replace(trailingRegion, '').trimEnd();
  } while (cleaned !== previous);

  // Whitespace is already trimmed here, so a final period really is the last character.
  return cleaned.replace(/\.$/, '').trim();
}
/**
 * Fallback: strip any leading non-numeric institution name prefix and floor/unit designators,
 * returning just the street number onwards. Handles patterns like:
 *   "Canossa School (H.K.) 8 Hoi Chak Street" → "8 Hoi Chak Street"
 *   "G/F., Wai Ming Block, 111 Wing Hong Street" → "111 Wing Hong Street"
 *   "3/F., Chi Wo Commercial Building, 20 Saigon Street" → "20 Saigon Street"
 *
 * Two patterns are tried in order:
 *  1. a house number that starts the address or a comma-separated segment;
 *  2. otherwise, a house number that simply follows whitespace (needed for
 *     prefixes that are not comma-terminated, such as "School (H.K.) 8 ...").
 * In both cases the number may carry one letter (e.g. "12A") and must be
 * followed by whitespace or a comma, which rules out floor tokens like "3/F.".
 *
 * @param address Raw address text.
 * @returns The cleaned street portion, or `null` when no house number is found
 *          or the remainder is not at least ~10% shorter than the input (in
 *          which case retrying would just repeat the original query).
 */
function extractStreetAddress(address: string): string | null {
  const atSegmentStart = /(?:^|,\s*)(\d+[A-Za-z]?(?:\s|,).*)/;
  const afterWhitespace = /\s(\d+[A-Za-z]?(?:\s|,).*)/;

  // Prefer the comma/start-anchored match so inputs like "Flat 5, 20 Saigon Street"
  // still resolve to the segment after the comma.
  const match = address.match(atSegmentStart) ?? address.match(afterWhitespace);
  const candidate = match?.[1]?.trim();
  if (!candidate) return null;

  // Must be meaningfully shorter than the full address to be worth retrying
  return candidate.length < address.length * 0.9 ? cleanAddress(candidate) : null;
}
/**
 * Sends a single free-text search to Nominatim, restricted to the given
 * country code, asking for at most one hit with address details.
 *
 * @param query Address text to search for.
 * @param nominatimCountry Value for the `countrycodes` request parameter.
 * @returns The first hit, or `null` when the response list is empty.
 *          Any rejection from the axios call propagates to the caller.
 */
async function nominatimSearch(query: string, nominatimCountry: string): Promise<NominatimSearchResult | null> {
  const searchParams = {
    q: query,
    format: 'json',
    limit: 1,
    countrycodes: nominatimCountry,
    addressdetails: 1,
  };
  const requestHeaders = {
    'User-Agent': 'NearestMass/1.0 (privacy@nearestmass.com)',
    'Accept-Language': 'en',
  };

  const { data } = await axios.get(NOMINATIM_SEARCH_URL, {
    params: searchParams,
    headers: requestHeaders,
    timeout: 15000,
  });

  const hits: NominatimSearchResult[] = data;
  if (hits.length > 0) {
    return hits[0];
  }
  return null;
}
/**
 * Geocodes an address in up to two attempts.
 *
 * First the cleaned full address is searched. If that yields nothing, the
 * street-number-onwards portion is tried, provided one can be extracted and
 * it differs from the first query. The rate-limit pause is observed between
 * the two requests.
 *
 * @param address Raw address text.
 * @param countryCode Country code as stored on the church row; translated via
 *        NOMINATIM_COUNTRY_MAP, or lower-cased when no mapping exists.
 * @returns The hit plus a flag telling whether the fallback query produced it,
 *          or `null` when neither attempt found anything.
 */
async function forwardGeocode(
  address: string,
  countryCode: string
): Promise<{ result: NominatimSearchResult; usedFallback: boolean } | null> {
  const osmCountry = NOMINATIM_COUNTRY_MAP[countryCode] ?? countryCode.toLowerCase();
  const fullQuery = cleanAddress(address);

  const directHit = await nominatimSearch(fullQuery, osmCountry);
  if (directHit) {
    return { result: directHit, usedFallback: false };
  }

  // Second attempt only makes sense with a distinct, non-empty street query.
  const streetQuery = extractStreetAddress(address);
  if (!streetQuery || streetQuery === fullQuery) {
    return null;
  }

  await sleep(RATE_LIMIT_MS); // respect rate limit between retries
  const streetHit = await nominatimSearch(streetQuery, osmCountry);
  return streetHit ? { result: streetHit, usedFallback: true } : null;
}
/**
 * Script entry point.
 *
 * Reads `--country <code>`, `--limit <n>` and `--dry-run` from the command line,
 * loads churches whose latitude and longitude are both 0 and that have an
 * address, forward-geocodes each one, and (unless in dry-run mode) stores the
 * coordinates. City and state from the geocoder are written only when the row
 * does not already have them. A summary is logged at the end.
 *
 * Per-church failures are logged and counted without aborting the run. The
 * database client and pool are released whether the run succeeds or fails.
 *
 * @throws Error when `--country` has no value or `--limit` is not a positive
 *         integer; errors from the initial database query also propagate.
 */
async function main(): Promise<void> {
  try {
    const args = process.argv.slice(2);
    const dryRun = args.includes('--dry-run');
    const countryIdx = args.indexOf('--country');
    const limitIdx = args.indexOf('--limit');

    // A bare "--country" must not silently widen the run to every country.
    let countryCode: string | undefined;
    if (countryIdx !== -1) {
      const value = args[countryIdx + 1];
      if (!value || value.startsWith('--')) {
        throw new Error('--country requires a value (e.g. --country HK)');
      }
      countryCode = value;
    }

    // Reject a missing/invalid limit up front instead of handing NaN to the query.
    let limit: number | undefined;
    if (limitIdx !== -1) {
      const parsed = parseInt(args[limitIdx + 1] ?? '', 10);
      if (!Number.isInteger(parsed) || parsed <= 0) {
        throw new Error('--limit requires a positive integer (e.g. --limit 10)');
      }
      limit = parsed;
    }

    log('============================================================');
    log('Nominatim Forward Geocode Enrichment');
    log('============================================================');
    log(`Country: ${countryCode || 'All'}`);
    log(`Limit: ${limit || 'No limit'}`);
    log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
    log('============================================================');

    const churches = await prisma.church.findMany({
      where: {
        latitude: 0,
        longitude: 0,
        address: { not: null },
        ...(countryCode ? { country: countryCode } : {}),
      },
      select: { id: true, name: true, address: true, country: true, city: true, state: true },
      orderBy: { createdAt: 'asc' },
      take: limit,
    }) as ChurchRecord[];
    log(`Found ${churches.length} churches with lat/lng=0 and an address\n`);

    const stats = { found: 0, notFound: 0, errors: 0 };
    for (const church of churches) {
      try {
        const geocoded = await forwardGeocode(church.address, church.country);
        if (!geocoded) {
          log(` - [NOT FOUND] ${church.name} | ${church.address}`);
          stats.notFound++;
        } else {
          const { result, usedFallback } = geocoded;
          const lat = parseFloat(result.lat);
          const lng = parseFloat(result.lon);
          // Never persist NaN/Infinity; count it as an error for this church instead.
          if (!Number.isFinite(lat) || !Number.isFinite(lng)) {
            throw new Error(`Non-numeric coordinates in geocoder response (lat=${result.lat}, lon=${result.lon})`);
          }
          const city = result.address?.city || result.address?.town ||
            result.address?.village || result.address?.municipality || null;
          const state = result.address?.state || result.address?.province || null;
          log(` + [FOUND${usedFallback ? ' (fallback)' : ''}] ${church.name}`);
          log(` ${church.address}`);
          log(`${lat}, ${lng}${city ? ` (${city})` : ''}`);
          if (!dryRun) {
            const updateData: Record<string, unknown> = { latitude: lat, longitude: lng };
            // Only fill city/state when the row has none; existing values win.
            if (city && !church.city) updateData.city = city;
            if (state && !church.state) updateData.state = state;
            await prisma.church.update({
              where: { id: church.id },
              data: updateData,
            });
          }
          stats.found++;
        }
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        log(` ! [ERROR] ${church.name}: ${message}`);
        stats.errors++;
      }
      // Pause after every church (including failures) to stay within the rate limit.
      await sleep(RATE_LIMIT_MS);
    }

    log('');
    log('============================================================');
    log('Forward Geocode Summary');
    log('============================================================');
    log(`Found coords: ${stats.found}`);
    log(`Not found: ${stats.notFound}`);
    log(`Errors: ${stats.errors}`);
    log('============================================================');
  } finally {
    // Release database resources on both the success and the failure path.
    await prisma.$disconnect();
    await pool.end();
  }
}
/** Prints an unrecoverable failure to stderr and ends the process with exit code 1. */
const reportFatal = (reason: unknown): never => {
  console.error('Fatal error:', reason);
  process.exit(1);
};

// Start the script; anything that escapes main() is treated as fatal.
void main().catch(reportFatal);