fix: clean up church-matcher types and add HK OSM bounding box

- Remove discovermassId/buscarmisasNetworkId from findDuplicateChurch match
  passes (importers now do their own pre-check dedup); restore as optional
  fields on ExistingChurch to keep type/runtime in sync
- Add HK bounding box to COUNTRY_BOUNDING_BOXES; fix silent 0-result
  fallback when country query returns empty from mirror server
- discovermass importer: add --limit flag and skip-already-imported
  pre-check using importedSlugs set
- Import scripts: remove discovermassId from ExistingChurch select/stubs
  (field not needed in shared matcher context)
- Schema: reorder discovermassId/kerknetId/gottesdienstzeitenId fields

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
albertfj114
2026-04-01 22:20:45 -04:00
parent 3bd4d2e2f9
commit 033f805965
21 changed files with 499 additions and 64 deletions

View File

@@ -27,8 +27,8 @@ export interface ExistingChurch {
miserendId: string | null;
kerknetId: string | null;
gottesdienstzeitenId: string | null;
discovermassId: string | null;
buscarmisasNetworkId: string | null;
discovermassId?: string | null;
buscarmisasNetworkId?: string | null;
source: string;
website: string | null;
phone: string | null;
@@ -138,8 +138,6 @@ export type ChurchCandidate = {
miserendId?: string;
kerknetId?: string;
gottesdienstzeitenId?: string;
discovermassId?: string;
buscarmisasNetworkId?: string;
};
/**
@@ -149,8 +147,8 @@ export type ChurchCandidate = {
* Matching strategy (in priority order):
* 1. Exact osmId match
* 2. Exact baiduId match
* 3-15. Exact importer ID matches (orarimesse, massSchedulesPh, philmass, horariosMisas, mszeInfo, weekdayMasses, messesInfo, bohosluzby, miserend, kerknet, gottesdienstzeiten, discovermass, buscarmisasNetwork)
* 16. Proximity + name similarity (within 200m + similar name)
* 3-13. Exact importer ID matches (orarimesse, massSchedulesPh, philmass, horariosMisas, mszeInfo, weekdayMasses, messesInfo, bohosluzby, miserend, kerknet, gottesdienstzeiten)
* 14. Proximity + name similarity (within 200m + similar name)
*/
export function findDuplicateChurch(
candidate: ChurchCandidate,
@@ -260,23 +258,7 @@ export function findDuplicateChurch(
if (gdzMatch) return gdzMatch;
}
// Fourteenth pass: exact discovermassId match
if (candidate.discovermassId) {
const match = existingChurches.find(
(church) => church.discovermassId === candidate.discovermassId
);
if (match) return match;
}
// Fifteenth pass: exact buscarmisasNetworkId match
if (candidate.buscarmisasNetworkId) {
const match = existingChurches.find(
(church) => church.buscarmisasNetworkId === candidate.buscarmisasNetworkId
);
if (match) return match;
}
// Sixteenth pass: proximity + name match (skip if candidate has no real coordinates)
// Fourteenth pass: proximity + name match (skip if candidate has no real coordinates)
if (candidate.lat === 0 && candidate.lng === 0) {
return null;
}

472
src/lib/overpass-client.ts Normal file
View File

@@ -0,0 +1,472 @@
/**
* Overpass API Client for querying OpenStreetMap data
* Used to import Catholic churches globally
*/
/**
 * A Catholic place of worship extracted from a single OpenStreetMap element.
 *
 * `osmId`, `name`, `lat` and `lng` are always set; every optional field is
 * copied from the element's OSM tags and is `undefined` when the tag is absent.
 */
export interface OSMChurch {
osmId: string; // OSM element type and id joined by "/", e.g. "node/12345" or "way/67890"
name: string; // value of the OSM `name` tag (elements without a name are not parsed)
lat: number; // latitude: the node's own position, or the `center` of a way/relation
lng: number; // longitude: the node's own position, or the `center` of a way/relation
address?: string; // "<addr:housenumber> <addr:street>", set only when `addr:street` exists
city?: string; // `addr:city` tag
state?: string; // `addr:state` tag
zip?: string; // `addr:postcode` tag
country?: string; // `addr:country` tag; expected to be an ISO 3166-1 alpha-2 code
phone?: string; // `phone` tag, falling back to `contact:phone`
website?: string; // `website` tag, falling back to `contact:website`
diocese?: string; // `diocese` tag
wheelchairAccess?: boolean; // true for wheelchair=yes, false for wheelchair=no, otherwise unset
serviceTimes?: string; // OSM service_times tag (opening_hours syntax), or `service_times:catholic`
}
/**
 * Public Overpass API interpreter endpoints, listed in failover order:
 * the first entry is tried first and a later entry is only contacted when
 * the ones before it have failed.
 */
const OVERPASS_ENDPOINTS: string[] = [
  "https://overpass-api.de/api/interpreter",
  "https://overpass.osm.ch/api/interpreter",
  "https://overpass.kumi.systems/api/interpreter",
];
/**
 * Regional bounding boxes for countries whose country-wide area query times out.
 *
 * Keys are ISO 3166-1 alpha-2 country codes. Each region is a plain
 * latitude/longitude rectangle in decimal degrees; regions of one country may
 * overlap each other (and neighbouring countries), so callers are expected to
 * deduplicate results by OSM id.
 *
 * Every box must satisfy -90 <= south < north <= 90 and
 * -180 <= west < east <= 180. A country that crosses the antimeridian
 * (e.g. Russia) therefore needs one box on each side of the 180° line.
 */
export const COUNTRY_BOUNDING_BOXES: Record<string, Array<{ name: string; south: number; west: number; north: number; east: number }>> = {
  GB: [
    { name: "England South", south: 49.9, west: -5.8, north: 52.5, east: 1.8 },
    { name: "England North + Wales", south: 52.5, west: -5.8, north: 55.8, east: 1.8 },
    { name: "Scotland", south: 55.0, west: -8.0, north: 60.9, east: -0.7 },
    { name: "Northern Ireland", south: 54.0, west: -8.2, north: 55.4, east: -5.4 },
  ],
  PL: [
    { name: "North", south: 52.0, west: 14.0, north: 54.9, east: 24.2 },
    { name: "South", south: 49.0, west: 14.0, north: 52.0, east: 24.2 },
  ],
  PT: [
    { name: "North", south: 40.0, west: -9.6, north: 42.2, east: -6.0 },
    { name: "South", south: 36.9, west: -9.6, north: 40.0, east: -6.0 },
  ],
  IT: [
    { name: "North", south: 44.0, west: 6.6, north: 47.1, east: 13.8 },
    { name: "Central", south: 41.0, west: 9.5, north: 44.0, east: 15.0 },
    { name: "South + Sicily", south: 36.6, west: 12.4, north: 41.0, east: 18.6 },
    { name: "Sardinia", south: 38.8, west: 8.1, north: 41.3, east: 9.9 },
  ],
  FR: [
    { name: "Northwest", south: 47.0, west: -5.2, north: 51.1, east: 2.0 },
    { name: "Northeast", south: 47.0, west: 2.0, north: 51.1, east: 8.3 },
    { name: "Southwest", south: 42.3, west: -2.0, north: 47.0, east: 2.0 },
    { name: "Southeast", south: 42.3, west: 2.0, north: 47.0, east: 7.8 },
  ],
  ES: [
    { name: "North", south: 42.0, west: -9.3, north: 43.8, east: 3.4 },
    { name: "Central", south: 39.0, west: -7.0, north: 42.0, east: 3.4 },
    { name: "South + Balearic", south: 36.0, west: -7.5, north: 39.0, east: 4.4 },
  ],
  DE: [
    { name: "North", south: 52.0, west: 5.9, north: 55.1, east: 15.1 },
    { name: "Central", south: 49.5, west: 5.9, north: 52.0, east: 15.1 },
    { name: "South", south: 47.3, west: 5.9, north: 49.5, east: 15.1 },
  ],
  PH: [
    { name: "Luzon", south: 12.0, west: 119.0, north: 19.0, east: 127.0 },
    { name: "Visayas", south: 9.0, west: 121.0, north: 12.0, east: 125.5 },
    { name: "Mindanao", south: 5.0, west: 121.0, north: 9.5, east: 127.0 },
  ],
  HN: [
    { name: "West", south: 13.0, west: -89.4, north: 16.0, east: -87.0 },
    { name: "East", south: 13.0, west: -87.0, north: 16.5, east: -83.1 },
  ],
  BR: [
    { name: "North", south: -5.0, west: -74.0, north: 5.3, east: -35.0 },
    { name: "Northeast", south: -13.0, west: -46.0, north: -5.0, east: -35.0 },
    { name: "Central-West", south: -24.0, west: -60.0, north: -5.0, east: -46.0 },
    { name: "Southeast", south: -24.0, west: -53.0, north: -13.0, east: -39.0 },
    { name: "South", south: -33.8, west: -58.0, north: -24.0, east: -48.0 },
  ],
  NG: [
    { name: "South", south: 4.0, west: 2.7, north: 8.0, east: 14.7 },
    { name: "North", south: 8.0, west: 2.7, north: 14.0, east: 14.7 },
  ],
  IN: [
    { name: "South", south: 8.0, west: 73.0, north: 16.0, east: 80.5 },
    { name: "Central", south: 16.0, west: 72.0, north: 24.0, east: 88.0 },
    { name: "North", south: 24.0, west: 68.0, north: 37.0, east: 97.5 },
    { name: "Northeast + East Coast", south: 16.0, west: 80.5, north: 28.0, east: 97.5 },
  ],
  CD: [
    { name: "West", south: -13.5, west: 12.0, north: 5.5, east: 24.0 },
    { name: "East", south: -13.5, west: 24.0, north: 5.5, east: 31.5 },
  ],
  AU: [
    { name: "East Coast", south: -39.0, west: 140.0, north: -10.0, east: 154.0 },
    { name: "West + Central", south: -39.0, west: 112.0, north: -10.0, east: 140.0 },
  ],
  US: [
    { name: "Northeast", south: 37.0, west: -82.0, north: 47.5, east: -66.9 },
    { name: "Southeast", south: 24.5, west: -91.7, north: 37.0, east: -75.0 },
    { name: "Midwest", south: 36.0, west: -104.1, north: 49.4, east: -82.0 },
    { name: "West", south: 24.5, west: -125.0, north: 49.4, east: -104.1 },
  ],
  MX: [
    { name: "North", south: 25.0, west: -118.0, north: 32.8, east: -97.0 },
    { name: "Central", south: 18.0, west: -106.0, north: 25.0, east: -96.0 },
    { name: "South", south: 14.5, west: -118.0, north: 18.0, east: -86.7 },
  ],
  AR: [
    { name: "North", south: -30.0, west: -74.0, north: -21.8, east: -53.6 },
    { name: "Central", south: -40.0, west: -72.0, north: -30.0, east: -56.7 },
    { name: "Patagonia", south: -55.1, west: -74.0, north: -40.0, east: -63.0 },
  ],
  CO: [
    { name: "North", south: 5.0, west: -79.0, north: 12.5, east: -66.9 },
    { name: "South", south: -4.2, west: -79.0, north: 5.0, east: -66.9 },
  ],
  CA: [
    { name: "BC + Alberta", south: 48.3, west: -139.1, north: 60.0, east: -110.0 },
    { name: "Ontario", south: 41.7, west: -95.2, north: 56.9, east: -74.3 },
    { name: "Quebec", south: 45.0, west: -79.8, north: 62.6, east: -57.1 },
    { name: "Atlantic + Prairies", south: 43.4, west: -110.0, north: 60.0, east: -52.6 },
  ],
  ID: [
    { name: "Sumatra + Java", south: -8.8, west: 95.0, north: 5.9, east: 114.6 },
    { name: "Kalimantan + Sulawesi", south: -5.0, west: 114.6, north: 4.0, east: 127.5 },
    { name: "Eastern Indonesia", south: -10.5, west: 127.5, north: 0.9, east: 141.0 },
  ],
  CN: [
    { name: "North", south: 35.0, west: 73.5, north: 53.6, east: 135.1 },
    { name: "East", south: 24.0, west: 113.0, north: 35.0, east: 123.0 },
    { name: "South", south: 18.2, west: 97.5, north: 24.0, east: 113.0 },
    { name: "West", south: 24.0, west: 73.5, north: 35.0, east: 113.0 },
  ],
  RU: [
    { name: "West (European Russia)", south: 45.0, west: 27.0, north: 70.0, east: 60.0 },
    { name: "Ural + West Siberia", south: 45.0, west: 60.0, north: 70.0, east: 90.0 },
    // Russia crosses the antimeridian. Longitudes above 180 are not valid bbox
    // values, so the far east is split into one box up to 180°E and a second
    // box covering the strip of Chukotka that lies in the western hemisphere.
    { name: "East Siberia + Far East", south: 42.0, west: 90.0, north: 72.0, east: 180.0 },
    { name: "Chukotka (east of antimeridian)", south: 64.0, west: -180.0, north: 72.0, east: -169.0 },
  ],
  HK: [
    { name: "Hong Kong", south: 22.15, west: 113.83, north: 22.56, east: 114.44 },
  ],
};
/** Minimum gap between consecutive Overpass requests: 5 seconds, to be respectful of the public servers. */
const RATE_LIMIT_MS = 5 * 1000;

/** `Date.now()` timestamp of the most recent request slot; 0 until the first request is made. */
let lastRequestTime = 0;
/**
 * Sleep helper used for rate limiting.
 *
 * @param ms Number of milliseconds to wait.
 * @returns A promise that resolves (with no value) once the timer has fired.
 */
async function delay(ms: number): Promise<void> {
  await new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Make sure at least `RATE_LIMIT_MS` milliseconds separate two requests.
 *
 * Sleeps for whatever part of the interval has not yet elapsed since the
 * previous call, then records the current time as the new request timestamp.
 */
async function enforceRateLimit(): Promise<void> {
  const elapsed = Date.now() - lastRequestTime;
  const remaining = RATE_LIMIT_MS - elapsed;

  if (remaining > 0) {
    console.log(`Rate limiting: waiting ${remaining}ms...`);
    await delay(remaining);
  }

  // Stamp after any wait so the next caller measures from the actual send time.
  lastRequestTime = Date.now();
}
/**
 * POST one Overpass QL query to a single endpoint and return the raw HTTP response.
 * Shared by the first attempt and the post-429 retry so both send the same request.
 */
async function postOverpassQuery(endpoint: string, query: string) {
  return fetch(endpoint, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/x-www-form-urlencoded',
      'User-Agent': 'NearestMass/1.0 (https://catholicmass.net; church data import)',
    },
    body: `data=${encodeURIComponent(query)}`,
  });
}

/**
 * Query Overpass API with failover support.
 *
 * Waits for the client-side rate limit, then tries each entry of
 * `OVERPASS_ENDPOINTS` in order until one returns a usable result.
 * An HTTP 429 is retried once on the same endpoint after a 60 second pause.
 *
 * A timeout is reported with a message containing "Gateway timeout" so callers
 * can switch to bounding-box queries. This covers both an HTTP 504 and an
 * HTTP 200 response whose `remark` field says the query timed out (Overpass
 * reports runtime errors that way, typically with few or no elements).
 *
 * @param query Overpass QL query text.
 * @returns The parsed JSON response body.
 * @throws Error when every endpoint failed; the message embeds the last failure.
 */
async function queryOverpass(query: string): Promise<any> {
  await enforceRateLimit();

  let lastError: Error | null = null;

  // Try each endpoint in order
  for (const endpoint of OVERPASS_ENDPOINTS) {
    try {
      console.log(`Querying Overpass API at ${endpoint}...`);
      let response = await postOverpassQuery(endpoint, query);

      if (response.status === 429) {
        console.warn(`Rate limited by ${endpoint}, waiting 60 seconds...`);
        await delay(60000);
        // Retry this endpoint once; the retry goes through the same checks below.
        response = await postOverpassQuery(endpoint, query);
      }

      if (response.status === 504) {
        throw new Error('Gateway timeout - query too complex, try bounding box approach');
      }
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      const data = await response.json();

      // Overpass signals server-side runtime errors inside a 200 response.
      const remark = typeof data?.remark === 'string' ? data.remark : '';
      if (/timed out/i.test(remark)) {
        throw new Error(`Gateway timeout - Overpass reported: ${remark}`);
      }
      if (remark) {
        console.warn(`Overpass remark from ${endpoint}: ${remark}`);
      }

      console.log(`Successfully retrieved ${data?.elements?.length ?? 0} elements`);
      return data;
    } catch (error) {
      console.error(`Failed to query ${endpoint}:`, error);
      lastError = error instanceof Error ? error : new Error(String(error));
      // Continue to next endpoint
    }
  }

  throw new Error(`All Overpass endpoints failed. Last error: ${lastError?.message}`);
}
/**
 * Convert one raw Overpass element into an `OSMChurch`.
 *
 * @param element Raw element from the Overpass JSON `elements` array.
 * @returns The parsed church, or `null` when the element has no `name` tag or
 *          no usable coordinates (a non-node element without `center`).
 */
function parseOSMElement(element: any): OSMChurch | null {
  const tags = element.tags || {};

  // Unnamed elements are not useful as churches.
  if (!tags.name) {
    return null;
  }

  // Nodes carry their own position; ways and relations only have the
  // computed `center` that `out center` adds.
  const position =
    element.type === 'node' ? { lat: element.lat, lon: element.lon } : element.center;
  if (!position) {
    return null;
  }

  // Street address: "<housenumber> <street>", trimmed when the number is missing.
  const street = tags['addr:street'];
  const houseNumber = tags['addr:housenumber'] || '';
  const address = street ? `${houseNumber} ${street}`.trim() : undefined;

  // Only the explicit yes/no values map to a boolean; anything else stays unset.
  const wheelchairAccess: boolean | undefined =
    tags.wheelchair === 'yes' ? true : tags.wheelchair === 'no' ? false : undefined;

  return {
    osmId: `${element.type}/${element.id}`,
    name: tags.name,
    lat: position.lat,
    lng: position.lon,
    address,
    city: tags['addr:city'],
    state: tags['addr:state'],
    zip: tags['addr:postcode'],
    country: tags['addr:country'],
    // Contact details may live under either the plain or the `contact:` key.
    phone: tags.phone || tags['contact:phone'],
    website: tags.website || tags['contact:website'],
    diocese: tags.diocese,
    wheelchairAccess,
    // Mass schedule in opening_hours format
    serviceTimes: tags.service_times || tags['service_times:catholic'] || undefined,
  };
}
/**
 * Fetch every Catholic place of worship inside one country's administrative area.
 *
 * The query selects the `admin_level=2` area whose `ISO3166-1` tag equals the
 * given code and collects nodes, ways and relations tagged as Christian places
 * of worship with denomination `catholic` or `roman_catholic`.
 *
 * @param countryCode ISO 3166-1 alpha-2 country code (e.g., "US", "MX", "BR")
 * @returns Array of OSMChurch objects (elements that cannot be parsed are dropped)
 * @throws Whatever `queryOverpass` throws; a gateway timeout is logged as a
 *         warning first and then re-thrown unchanged.
 */
export async function queryOverpassByCountry(countryCode: string): Promise<OSMChurch[]> {
  const overpassQl = `
[out:json][timeout:300];
area["ISO3166-1"="${countryCode}"][admin_level=2]->.searchArea;
(
nwr["amenity"="place_of_worship"]["religion"="christian"]["denomination"="catholic"](area.searchArea);
nwr["amenity"="place_of_worship"]["religion"="christian"]["denomination"="roman_catholic"](area.searchArea);
);
out center;
`.trim();

  console.log(`Querying Catholic churches in ${countryCode}...`);

  try {
    const payload = await queryOverpass(overpassQl);

    const parsed: OSMChurch[] = (payload.elements || [])
      .map((rawElement: any) => parseOSMElement(rawElement))
      .filter((candidate: OSMChurch | null): candidate is OSMChurch => Boolean(candidate));

    console.log(`Found ${parsed.length} Catholic churches in ${countryCode}`);
    return parsed;
  } catch (error) {
    const timedOut = (error as Error).message.includes('Gateway timeout');
    if (timedOut) {
      // The bounding-box fallback itself is not performed here; the error is
      // passed on unchanged either way.
      console.warn(`Query timeout for ${countryCode}, falling back to bounding box approach...`);
    }
    throw error;
  }
}
/**
 * Fetch every Catholic place of worship inside a latitude/longitude rectangle.
 * Useful for large countries or when a country-wide area query times out.
 *
 * @param south Southern latitude
 * @param west Western longitude
 * @param north Northern latitude
 * @param east Eastern longitude
 * @returns Array of OSMChurch objects (elements that cannot be parsed are dropped)
 */
export async function queryOverpassByBoundingBox(
  south: number,
  west: number,
  north: number,
  east: number
): Promise<OSMChurch[]> {
  // Overpass expects bounding boxes as (south,west,north,east).
  const bbox = `${south},${west},${north},${east}`;

  const overpassQl = `
[out:json][timeout:300];
(
nwr["amenity"="place_of_worship"]["religion"="christian"]["denomination"="catholic"](${bbox});
nwr["amenity"="place_of_worship"]["religion"="christian"]["denomination"="roman_catholic"](${bbox});
);
out center;
`.trim();

  console.log(`Querying Catholic churches in bbox (${bbox})...`);

  const payload = await queryOverpass(overpassQl);

  const parsed: OSMChurch[] = (payload.elements || [])
    .map((rawElement: any) => parseOSMElement(rawElement))
    .filter((candidate: OSMChurch | null): candidate is OSMChurch => Boolean(candidate));

  console.log(`Found ${parsed.length} Catholic churches in bounding box`);
  return parsed;
}
/**
 * Query Catholic churches for a country, falling back to regional bounding boxes.
 *
 * The country-wide query runs first. The regional sweep over
 * `COUNTRY_BOUNDING_BOXES[countryCode]` is used when that query fails with a
 * "Gateway timeout" error, or when it returns zero churches for a country that
 * has bounding boxes defined (treated as a silent failure of the country query).
 *
 * @param countryCode ISO 3166-1 alpha-2 country code (e.g., "US", "MX", "BR")
 * @returns Array of OSMChurch objects (deduplicated by osmId)
 * @throws Error when a timeout occurs and no bounding boxes exist for the
 *         country; any non-timeout error is re-thrown unchanged.
 */
export async function queryOverpassByCountryWithFallback(countryCode: string): Promise<OSMChurch[]> {
  try {
    const countryWide = await queryOverpassByCountry(countryCode);
    const hasRegionalBoxes = Boolean(COUNTRY_BOUNDING_BOXES[countryCode]);

    if (countryWide.length > 0 || !hasRegionalBoxes) {
      return countryWide;
    }

    // An empty result for a country with known regions is handled like a
    // timeout so the catch block below runs the regional sweep.
    console.log(`Country query returned 0 results for ${countryCode}, retrying with bounding boxes...`);
    throw new Error('Gateway timeout - query too complex, try bounding box approach');
  } catch (error) {
    // Anything that is not a timeout is passed straight on to the caller.
    if (!(error as Error).message.includes('Gateway timeout')) {
      throw error;
    }

    const regions = COUNTRY_BOUNDING_BOXES[countryCode];
    if (!regions) {
      throw new Error(
        `Gateway timeout for ${countryCode} and no bounding boxes defined. ` +
        `Consider adding regional bounding boxes to COUNTRY_BOUNDING_BOXES in overpass-client.ts`
      );
    }

    console.log(`Falling back to ${regions.length} regional queries for ${countryCode}...`);

    const merged: OSMChurch[] = [];
    const seen = new Set<string>();
    const lastPosition = regions.length - 1;

    for (const [position, region] of regions.entries()) {
      console.log(`\nQuerying region: ${region.name}`);
      const found = await queryOverpassByBoundingBox(
        region.south,
        region.west,
        region.north,
        region.east
      );

      // Regions may overlap, so keep only churches not seen in an earlier region.
      const sizeBefore = merged.length;
      for (const church of found) {
        if (seen.has(church.osmId)) {
          continue;
        }
        seen.add(church.osmId);
        merged.push(church);
      }
      const added = merged.length - sizeBefore;

      console.log(`Added ${added} new churches from ${region.name} (${found.length - added} duplicates)`);

      // queryOverpass already rate-limits; this is an extra pause between regions.
      if (position < lastPosition) {
        await delay(2000);
      }
    }

    console.log(`\nTotal churches found across all regions: ${merged.length}`);
    return merged;
  }
}