feat: add discovermassId to church-matcher ExistingChurch and ChurchCandidate

Add discovermassId field to ExistingChurch interface and ChurchCandidate type,
insert a dedicated matching pass in findDuplicateChurch, and update all 15 importer
push blocks plus 16 loadExistingChurches select queries to include the new field.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
albertfj114
2026-03-11 06:52:05 -04:00
parent 2706708c51
commit a046928ed0
17 changed files with 11576 additions and 0 deletions

396
src/lib/church-matcher.ts Normal file
View File

@@ -0,0 +1,396 @@
/**
* Church matching and deduplication logic
* Used to avoid duplicate churches when importing from multiple sources (OSM, MassTimes, etc.)
*/
import { calculateDistance } from './geo';
import type { OSMChurch } from './overpass-client';
import type { BaiduChurch } from './baidu-client';
/**
 * Shape of a church record that already exists in the database, as consumed
 * by the matching and merge helpers in this module.
 */
export interface ExistingChurch {
  // --- Identity ---
  id: string;
  name: string;
  source: string;
  country?: string;

  // --- Position (fed to calculateDistance as { lat, lng }) ---
  latitude: number;
  longitude: number;

  // --- Contact details; the merge helpers only fill these when they are empty ---
  address: string | null;
  phone: string | null;
  website: string | null;

  // --- External identifiers, one per import source ---
  // NOTE(review): masstimesId has no counterpart on ChurchCandidate, so the
  // exact-ID passes in findDuplicateChurch never look at it — confirm that is intended.
  osmId: string | null;
  baiduId: string | null;
  masstimesId: string | null;
  orarimesseId: string | null;
  massSchedulesPhId: string | null;
  philmassId: string | null;
  horariosMisasId: string | null;
  mszeInfoId: string | null;
  weekdayMassesId: string | null;
  messesInfoId: string | null;
  bohosluzbyId: string | null;
  miserendId: string | null;
  kerknetId: string | null;
  gottesdienstzeitenId: string | null;
  discovermassId: string | null;
}
/**
 * Radius, in kilometres, inside which an existing church is considered a
 * potential duplicate of a candidate (200 metres).
 */
const DUPLICATE_DISTANCE_KM = 200 / 1000;
/**
 * Normalize a church name for comparison.
 *
 * Steps, applied in this order:
 * - lowercase
 * - expand "st." / "st" followed by whitespace to "saint"
 * - drop the filler phrases "roman catholic", "catholic church", "parish", "church"
 * - strip punctuation and symbols while keeping letters, combining marks and
 *   digits of any script
 * - collapse whitespace runs and trim
 *
 * @param name Raw church name as supplied by a source.
 * @returns The normalized name; may be an empty string when the name consists
 *   only of filler phrases and punctuation.
 */
function normalizeName(name: string): string {
  return (
    name
      .toLowerCase()
      .replace(/\bst\.\s/g, 'saint ')
      .replace(/\bst\s/g, 'saint ')
      // "roman catholic" must be removed before "catholic church"; otherwise
      // "roman catholic church" loses its last two words first and a stray
      // "roman" is left behind.
      .replace(/\broman catholic\b/g, '')
      .replace(/\bcatholic church\b/g, '')
      .replace(/\bparish\b/g, '')
      .replace(/\bchurch\b/g, '')
      // Unicode-aware punctuation strip. The ASCII-only class [^\w\s] would
      // also delete every non-ASCII letter, turning e.g. Chinese names into ""
      // and making unrelated names compare as identical.
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
      .replace(/\s+/g, ' ') // Normalize whitespace
      .trim()
  );
}
/**
 * Levenshtein (edit) distance between two strings, used for fuzzy name
 * matching.
 *
 * Only two rows of the dynamic-programming table are kept at a time: the row
 * for the previous character of `b` and the row being filled in.
 *
 * @param a First string (indexes the columns).
 * @param b Second string (indexes the rows).
 * @returns Minimum number of single-character substitutions, insertions and
 *   deletions that turn one string into the other.
 */
function levenshteinDistance(a: string, b: string): number {
  const width = a.length;
  // Row 0: distance from the empty prefix of b to each prefix of a.
  let previousRow: number[] = Array.from({ length: width + 1 }, (_, col) => col);

  for (let row = 1; row <= b.length; row++) {
    const rowChar = b.charAt(row - 1);
    // Column 0: distance from a prefix of b to the empty prefix of a.
    const currentRow: number[] = [row];

    for (let col = 1; col <= width; col++) {
      const diagonal = previousRow[col - 1];
      currentRow[col] =
        rowChar === a.charAt(col - 1)
          ? diagonal // characters agree: no extra edit
          : 1 + Math.min(diagonal, currentRow[col - 1], previousRow[col]);
    }

    previousRow = currentRow;
  }

  return previousRow[width];
}
/**
 * Decide whether two church names are close enough to be the same church.
 *
 * Both names are normalized first. They count as similar when either
 * - both normalized names have at least 5 characters and some 5-character
 *   window of the first occurs anywhere in the second, or
 * - their Levenshtein distance is below 5.
 *
 * NOTE(review): the word "saint" is itself 5 characters long, so any two
 * names that normalize to "saint ..." share a window and are reported as
 * similar. Likewise any two normalized names shorter than 5 characters pass
 * the distance test. Confirm this looseness is acceptable given that callers
 * also require proximity.
 *
 * @param name1 First raw name.
 * @param name2 Second raw name.
 * @returns true when the names are considered similar.
 */
function namesAreSimilar(name1: string, name2: string): boolean {
  const left = normalizeName(name1);
  const right = normalizeName(name2);

  // Shared 5-character window; skipped entirely unless both names are long enough.
  if (left.length >= 5 && right.length >= 5) {
    for (let start = 0; start + 5 <= left.length; start++) {
      const window = left.substring(start, start + 5);
      if (right.includes(window)) {
        return true;
      }
    }
  }

  // Fall back to edit distance.
  return levenshteinDistance(left, right) < 5;
}
/**
 * Source-agnostic input for deduplication — works with OSM, Baidu, or any
 * other source. A candidate always has a name and a position, plus whichever
 * external IDs the importing source knows about (all optional).
 */
export type ChurchCandidate = {
  name: string;
  lat: number;
  lng: number;
} & Partial<
  Record<
    | 'osmId'
    | 'baiduId'
    | 'orarimesseId'
    | 'massSchedulesPhId'
    | 'philmassId'
    | 'horariosMisasId'
    | 'mszeInfoId'
    | 'weekdayMassesId'
    | 'messesInfoId'
    | 'bohosluzbyId'
    | 'miserendId'
    | 'kerknetId'
    | 'gottesdienstzeitenId'
    | 'discovermassId',
    string
  >
>;
/**
 * External-ID fields compared for an exact match, in priority order.
 * When a candidate carries several IDs, the field listed first wins.
 */
const EXACT_ID_MATCH_ORDER: ReadonlyArray<
  Exclude<keyof ChurchCandidate & keyof ExistingChurch, 'name'>
> = [
  'osmId',
  'baiduId',
  'orarimesseId',
  'massSchedulesPhId',
  'philmassId',
  'horariosMisasId',
  'mszeInfoId',
  'weekdayMassesId',
  'messesInfoId',
  'bohosluzbyId',
  'miserendId',
  'kerknetId',
  'gottesdienstzeitenId',
  'discovermassId',
];

/**
 * Find an existing church that duplicates the given candidate.
 *
 * Matching strategy (first hit wins):
 * 1-14. Exact external-ID match, one pass per field in this order: osmId,
 *       baiduId, orarimesseId, massSchedulesPhId, philmassId, horariosMisasId,
 *       mszeInfoId, weekdayMassesId, messesInfoId, bohosluzbyId, miserendId,
 *       kerknetId, gottesdienstzeitenId, discovermassId. A pass is skipped
 *       when the candidate has no (or an empty) value for that field.
 * 15.   Proximity + name similarity: the first existing church, in array
 *       order, that lies within DUPLICATE_DISTANCE_KM of the candidate and
 *       whose name passes namesAreSimilar. Skipped when the candidate's
 *       coordinates are exactly (0, 0), which is treated as "no real
 *       coordinates".
 *
 * @param candidate Church about to be imported.
 * @param existingChurches Churches already in the database.
 * @returns The first matching existing church, or null when none matches.
 */
export function findDuplicateChurch(
  candidate: ChurchCandidate,
  existingChurches: ExistingChurch[]
): ExistingChurch | null {
  // Passes 1-14: exact external-ID matches, in priority order.
  for (const field of EXACT_ID_MATCH_ORDER) {
    const wantedId = candidate[field];
    if (!wantedId) {
      continue;
    }
    const idMatch = existingChurches.find((church) => church[field] === wantedId);
    if (idMatch) {
      return idMatch;
    }
  }

  // Pass 15: proximity + name match (skip if candidate has no real coordinates).
  if (candidate.lat === 0 && candidate.lng === 0) {
    return null;
  }

  for (const church of existingChurches) {
    const distance = calculateDistance(
      { lat: candidate.lat, lng: candidate.lng },
      { lat: church.latitude, lng: church.longitude }
    );
    // Only compare names for churches inside the duplicate radius.
    if (distance <= DUPLICATE_DISTANCE_KM && namesAreSimilar(candidate.name, church.name)) {
      return church;
    }
  }

  return null;
}
/**
 * Build the update payload for merging OSM data into an existing church.
 *
 * Rules:
 * - Always set: osmId, osmLastSyncedAt, hasWebsite.
 * - hasWebsite reflects the record AFTER the merge: it is true when either the
 *   existing record or the OSM data has a website. (Deriving it from the OSM
 *   data alone would flip it to false for a church whose existing website is
 *   kept because OSM has none.)
 * - latitude/longitude: replaced with the OSM position only when it is more
 *   than 50 m away from the stored one.
 * - address, phone, website: taken from OSM only when the existing value is
 *   empty and OSM provides one; existing values are never overwritten.
 * - source: switched to "osm" only when it is currently "manual".
 * - Nothing else (e.g. name) is included in the payload.
 *
 * @param existing Church record currently stored.
 * @param osmData Church data fetched from OSM.
 * @returns Partial update containing only the fields that should change.
 */
export function mergeChurchData(
  existing: ExistingChurch,
  osmData: OSMChurch
): Partial<ExistingChurch> & { osmId: string; osmLastSyncedAt: Date; hasWebsite: boolean } {
  // Distance (km) beyond which the OSM position replaces the stored one.
  const COORD_UPDATE_THRESHOLD_KM = 0.05;

  // NOTE(review): `any` is kept because OSMChurch's field types are not
  // visible from this module; tighten to the declared return type once they
  // are confirmed to line up.
  const merged: any = {
    osmId: osmData.osmId,
    osmLastSyncedAt: new Date(),
    hasWebsite: Boolean(existing.website || osmData.website),
  };

  // Only update coordinates if they differ significantly (more than 50m)
  const coordDistance = calculateDistance(
    { lat: existing.latitude, lng: existing.longitude },
    { lat: osmData.lat, lng: osmData.lng }
  );
  if (coordDistance > COORD_UPDATE_THRESHOLD_KM) {
    merged.latitude = osmData.lat;
    merged.longitude = osmData.lng;
  }

  // Fill contact fields only where the existing record has nothing.
  if (!existing.address && osmData.address) {
    merged.address = osmData.address;
  }
  if (!existing.phone && osmData.phone) {
    merged.phone = osmData.phone;
  }
  if (!existing.website && osmData.website) {
    merged.website = osmData.website;
  }

  // Update source to "osm" if currently "manual"
  if (existing.source === 'manual') {
    merged.source = 'osm';
  }

  return merged;
}
/**
 * Build the update payload for merging Baidu Maps data into an existing
 * church. Counterpart of mergeChurchData for the Baidu source.
 *
 * Rules:
 * - Always set: baiduId, baiduLastSyncedAt.
 * - latitude/longitude: replaced with the Baidu position only when it is more
 *   than 50 m away from the stored one AND the existing record has no osmId
 *   (OSM coordinates are treated as more reliable).
 * - address, phone: taken from Baidu only when the existing value is empty and
 *   Baidu provides one.
 * - city, state: set from Baidu's city / province whenever Baidu provides them.
 * - website is not touched by this function.
 *
 * NOTE(review): city/state are written unconditionally because ExistingChurch
 * carries no city/state fields to compare against. The previous
 * `merged.city || baiduData.city` form read a key that had never been set on
 * the fresh payload, so it behaved exactly like this. If the intent is "only
 * when not already set", the existing record's city/state must be loaded and
 * checked here.
 *
 * @param existing Church record currently stored.
 * @param baiduData Church data fetched from Baidu Maps.
 * @returns Update payload containing only the fields that should change.
 */
export function mergeBaiduData(
  existing: ExistingChurch,
  baiduData: BaiduChurch
): Record<string, unknown> {
  // Distance (km) beyond which the Baidu position may replace the stored one.
  const COORD_UPDATE_THRESHOLD_KM = 0.05;

  const merged: Record<string, unknown> = {
    baiduId: baiduData.baiduId,
    baiduLastSyncedAt: new Date(),
  };

  // Only update coordinates if they differ significantly (more than 50m)
  // and the record is not already anchored to OSM coordinates.
  const coordDistance = calculateDistance(
    { lat: existing.latitude, lng: existing.longitude },
    { lat: baiduData.lat, lng: baiduData.lng }
  );
  if (coordDistance > COORD_UPDATE_THRESHOLD_KM && !existing.osmId) {
    merged.latitude = baiduData.lat;
    merged.longitude = baiduData.lng;
  }

  // Fill contact fields only where the existing record has nothing.
  if (!existing.address && baiduData.address) {
    merged.address = baiduData.address;
  }
  if (!existing.phone && baiduData.phone) {
    merged.phone = baiduData.phone;
  }

  // City / province are copied whenever Baidu supplies them (see NOTE above).
  if (baiduData.city) {
    merged.city = baiduData.city;
  }
  if (baiduData.province) {
    merged.state = baiduData.province;
  }

  return merged;
}