feat: add discovermassId to church-matcher ExistingChurch and ChurchCandidate
Add a discovermassId field to the ExistingChurch interface and the ChurchCandidate type, insert a dedicated matching pass in findDuplicateChurch, and update all 15 importer push blocks plus 16 loadExistingChurches select queries to include the new field.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
396
src/lib/church-matcher.ts
Normal file
396
src/lib/church-matcher.ts
Normal file
@@ -0,0 +1,396 @@
|
||||
/**
 * Church matching and deduplication logic.
 * Used to avoid duplicate churches when importing from multiple sources (OSM, MassTimes, etc.).
 */
|
||||
|
||||
import { calculateDistance } from './geo';
|
||||
import type { OSMChurch } from './overpass-client';
|
||||
import type { BaiduChurch } from './baidu-client';
|
||||
|
||||
/**
 * An existing church record from the database, as consumed by the
 * matching and merge helpers in this module.
 *
 * Every `*Id` field other than `id` is nullable; the matching code compares
 * these against the same-named field of an incoming candidate.
 */
export interface ExistingChurch {
  id: string;
  name: string;
  latitude: number;
  longitude: number;
  osmId: string | null;
  baiduId: string | null;
  masstimesId: string | null;
  orarimesseId: string | null;
  massSchedulesPhId: string | null;
  philmassId: string | null;
  horariosMisasId: string | null;
  mszeInfoId: string | null;
  weekdayMassesId: string | null;
  messesInfoId: string | null;
  bohosluzbyId: string | null;
  miserendId: string | null;
  kerknetId: string | null;
  gottesdienstzeitenId: string | null;
  discovermassId: string | null;
  source: string;
  website: string | null;
  phone: string | null;
  address: string | null;
  country?: string;
}
|
||||
|
||||
/** Maximum distance, in kilometres, at which two churches are still treated as potential duplicates (200 metres). */
const DUPLICATE_DISTANCE_KM = 0.2;
|
||||
|
||||
/**
 * Normalize a church name for comparison.
 *
 * Steps, applied in this order:
 * - lowercase
 * - expand "st." / "st" (when followed by whitespace) to "saint"
 * - drop the generic phrases "roman catholic", "catholic church", "parish" and "church"
 * - strip punctuation, keeping letters, combining marks, digits and underscores of any script
 * - collapse whitespace runs to a single space and trim
 *
 * @param name Raw church name.
 * @returns The normalized name. It is empty when the input consists only of
 *   generic words and punctuation (e.g. "Catholic Church").
 */
function normalizeName(name: string): string {
  return (
    name
      .toLowerCase()
      .replace(/\bst\.\s/g, 'saint ')
      .replace(/\bst\s/g, 'saint ')
      // "roman catholic" has to be removed before "catholic church": in the
      // opposite order "roman catholic church" loses "catholic church" first
      // and a stray "roman" survives in the result.
      .replace(/\broman catholic\b/g, '')
      .replace(/\bcatholic church\b/g, '')
      .replace(/\bparish\b/g, '')
      .replace(/\bchurch\b/g, '')
      // Unicode-aware punctuation strip. A plain [^\w\s] is ASCII-only and
      // would delete accented and non-Latin letters, reducing such names to
      // fragments or to the empty string.
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
      .replace(/\s+/g, ' ')
      .trim()
  );
}
|
||||
|
||||
/**
 * Compute the Levenshtein (edit) distance between two strings: the minimum
 * number of single-character insertions, deletions and substitutions needed
 * to turn one into the other. Used for fuzzy name matching.
 *
 * Only two rows of the dynamic-programming table are kept at a time, so the
 * extra memory is proportional to `a.length` rather than `a.length * b.length`.
 *
 * @param a First string.
 * @param b Second string.
 * @returns The edit distance; 0 when the strings are equal.
 */
function levenshteinDistance(a: string, b: string): number {
  // Trivial cases: nothing to edit, or every character must be inserted.
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  // previousRow[j] = distance between the first (i - 1) chars of b and the first j chars of a.
  let previousRow: number[] = [];
  for (let j = 0; j <= a.length; j++) {
    previousRow.push(j);
  }

  for (let i = 1; i <= b.length; i++) {
    const bChar = b.charAt(i - 1);
    const currentRow: number[] = [i];

    for (let j = 1; j <= a.length; j++) {
      if (bChar === a.charAt(j - 1)) {
        // Matching characters cost nothing: carry the diagonal value over.
        currentRow[j] = previousRow[j - 1];
      } else {
        currentRow[j] =
          1 +
          Math.min(
            previousRow[j - 1], // substitution
            currentRow[j - 1], // insertion
            previousRow[j] // deletion
          );
      }
    }

    previousRow = currentRow;
  }

  return previousRow[a.length];
}
|
||||
|
||||
/**
 * Decide whether two church names plausibly refer to the same church.
 *
 * Both names are passed through `normalizeName` first. They count as similar when
 * - the normalized names are identical, or
 * - they share a run of 5 or more consecutive characters, or
 * - their Levenshtein distance is below 5 and also below the length of the
 *   longer normalized name.
 *
 * The length cap matters for short names: a distance equal to the longer
 * length means no character could be kept, so pairs such as "mary" / "john"
 * (distance 4) must not pass merely because 4 < 5.
 *
 * NOTE(review): the shared-run rule can be satisfied by a common word alone.
 * Normalization expands "St." to "saint", which is exactly 5 characters, so
 * two different "Saint ..." names are reported as similar. Left unchanged
 * here because tightening it alters which imports are merged.
 *
 * @param name1 First raw church name.
 * @param name2 Second raw church name.
 * @returns `true` when the names are considered similar.
 */
function namesAreSimilar(name1: string, name2: string): boolean {
  const first = normalizeName(name1);
  const second = normalizeName(name2);

  if (first === second) {
    return true;
  }

  // Shared run of 5+ characters: slide a 5-character window over the first name.
  if (Math.min(first.length, second.length) >= 5) {
    for (let start = 0; start + 5 <= first.length; start++) {
      if (second.includes(first.substring(start, start + 5))) {
        return true;
      }
    }
  }

  // Edit-distance check, capped so that completely different short names fail.
  const longerLength = Math.max(first.length, second.length);
  return levenshteinDistance(first, second) < Math.min(5, longerLength);
}
|
||||
|
||||
/**
 * Candidate church for deduplication — works with OSM, Baidu, or any other source.
 *
 * `name`, `lat` and `lng` are required. Each optional `*Id` field, when set,
 * is compared against the same-named field of the existing records.
 *
 * NOTE(review): `ExistingChurch` also declares `masstimesId`, which has no
 * counterpart here — confirm whether that is intentional.
 */
export type ChurchCandidate = {
  name: string;
  lat: number;
  lng: number;
  osmId?: string;
  baiduId?: string;
  orarimesseId?: string;
  massSchedulesPhId?: string;
  philmassId?: string;
  horariosMisasId?: string;
  mszeInfoId?: string;
  weekdayMassesId?: string;
  messesInfoId?: string;
  bohosluzbyId?: string;
  miserendId?: string;
  kerknetId?: string;
  gottesdienstzeitenId?: string;
  discovermassId?: string;
};
|
||||
|
||||
/**
 * Source-ID fields compared for an exact match, in priority order.
 * Restricted by type to keys that exist on both the candidate and the
 * existing record.
 */
const EXACT_ID_MATCH_ORDER: ReadonlyArray<
  Exclude<keyof ChurchCandidate & keyof ExistingChurch, 'name'>
> = [
  'osmId',
  'baiduId',
  'orarimesseId',
  'massSchedulesPhId',
  'philmassId',
  'horariosMisasId',
  'mszeInfoId',
  'weekdayMassesId',
  'messesInfoId',
  'bohosluzbyId',
  'miserendId',
  'kerknetId',
  'gottesdienstzeitenId',
  'discovermassId',
];

/**
 * Find a duplicate of `candidate` among the existing churches.
 *
 * Matching strategy (in priority order):
 *  1. Exact osmId match
 *  2. Exact baiduId match
 *  3-14. Exact importer ID matches (orarimesse, massSchedulesPh, philmass,
 *        horariosMisas, mszeInfo, weekdayMasses, messesInfo, bohosluzby,
 *        miserend, kerknet, gottesdienstzeiten, discovermass)
 *  15. Proximity + name similarity (within 200m + similar name)
 *
 * An ID pass is skipped when the candidate has no value for that ID. The
 * proximity pass is skipped when the candidate sits at exactly (0, 0), which
 * is treated as "no real coordinates".
 *
 * @param candidate Church about to be imported.
 * @param existingChurches Existing records to search.
 * @returns The first existing church matched by the highest-priority rule
 *   that matches, or `null` if no duplicate is found.
 */
export function findDuplicateChurch(
  candidate: ChurchCandidate,
  existingChurches: ExistingChurch[]
): ExistingChurch | null {
  // Passes 1-14: exact ID matches, one ID field at a time.
  for (const idKey of EXACT_ID_MATCH_ORDER) {
    const candidateId = candidate[idKey];
    if (!candidateId) {
      continue;
    }
    const idMatch = existingChurches.find((church) => church[idKey] === candidateId);
    if (idMatch) {
      return idMatch;
    }
  }

  // Pass 15: proximity + name match (skip if candidate has no real coordinates).
  if (candidate.lat === 0 && candidate.lng === 0) {
    return null;
  }

  const nearbyMatch = existingChurches.find(
    (church) =>
      calculateDistance(
        { lat: candidate.lat, lng: candidate.lng },
        { lat: church.latitude, lng: church.longitude }
      ) <= DUPLICATE_DISTANCE_KM && namesAreSimilar(candidate.name, church.name)
  );

  return nearbyMatch ?? null;
}
|
||||
|
||||
/**
 * Build the update patch that merges OSM data into an existing church record.
 * Only the fields that should change are present in the result.
 *
 * Rules:
 * - Never overwrite: name, massSchedules, scraperConfig (none of them is ever put in the patch)
 * - Always set: osmId, osmLastSyncedAt, hasWebsite
 * - hasWebsite reflects the website the record ends up with, i.e. it is true
 *   when either the existing record or OSM provides one
 * - address, phone, website: taken from OSM only if the existing field is empty
 * - latitude/longitude: replaced only if OSM is more than 50m away from the stored position
 * - source: changed to "osm" only if it is currently "manual"
 *
 * @param existing Current database record.
 * @param osmData Church data from OSM.
 * @returns The fields to write back.
 */
export function mergeChurchData(
  existing: ExistingChurch,
  osmData: OSMChurch
): Partial<ExistingChurch> & { osmId: string; osmLastSyncedAt: Date; hasWebsite: boolean } {
  const merged: Partial<ExistingChurch> & {
    osmId: string;
    osmLastSyncedAt: Date;
    hasWebsite: boolean;
  } = {
    osmId: osmData.osmId,
    osmLastSyncedAt: new Date(),
    // An existing website is kept (see below), so it must count here as well;
    // looking at OSM alone would report false for a record that has a website.
    hasWebsite: Boolean(existing.website || osmData.website),
  };

  // Only update coordinates if they differ significantly (more than 50m).
  const coordDistance = calculateDistance(
    { lat: existing.latitude, lng: existing.longitude },
    { lat: osmData.lat, lng: osmData.lng }
  );
  if (coordDistance > 0.05) {
    merged.latitude = osmData.lat;
    merged.longitude = osmData.lng;
  }

  // Fill address, phone and website only where the existing record has none.
  if (!existing.address && osmData.address) {
    merged.address = osmData.address;
  }
  if (!existing.phone && osmData.phone) {
    merged.phone = osmData.phone;
  }
  if (!existing.website && osmData.website) {
    merged.website = osmData.website;
  }

  // Update source to "osm" if currently "manual".
  if (existing.source === 'manual') {
    merged.source = 'osm';
  }

  return merged;
}
|
||||
|
||||
/**
 * Build the update patch that merges Baidu Maps data into an existing church
 * record. Similar to mergeChurchData, but for the Baidu source.
 *
 * Rules:
 * - Always set: baiduId, baiduLastSyncedAt
 * - latitude/longitude: replaced only if Baidu is more than 50m away from the
 *   stored position AND the record has no osmId (OSM coords are more reliable)
 * - address, phone: taken from Baidu only if the existing field is empty
 * - city, state: taken from Baidu (city / province) only if the existing
 *   record does not already carry a value
 *
 * @param existing Current database record.
 * @param baiduData Church data from Baidu Maps.
 * @returns The fields to write back.
 */
export function mergeBaiduData(
  existing: ExistingChurch,
  baiduData: BaiduChurch
): Record<string, unknown> {
  const merged: Record<string, unknown> = {
    baiduId: baiduData.baiduId,
    baiduLastSyncedAt: new Date(),
  };

  // Only update coordinates if they differ significantly (more than 50m)
  // and the record is not already positioned from OSM.
  const coordDistance = calculateDistance(
    { lat: existing.latitude, lng: existing.longitude },
    { lat: baiduData.lat, lng: baiduData.lng }
  );
  if (coordDistance > 0.05 && !existing.osmId) {
    merged.latitude = baiduData.lat;
    merged.longitude = baiduData.lng;
  }

  // Fill address and phone only where the existing record has none.
  if (!existing.address && baiduData.address) {
    merged.address = baiduData.address;
  }
  if (!existing.phone && baiduData.phone) {
    merged.phone = baiduData.phone;
  }

  // Set city/state only if not already set. ExistingChurch does not declare
  // these two fields, so they are read as optional extras: when the loaded
  // record carries them they are respected, otherwise Baidu's values are used.
  const current = existing as ExistingChurch & {
    city?: string | null;
    state?: string | null;
  };
  if (baiduData.city && !current.city) {
    merged.city = baiduData.city;
  }
  if (baiduData.province && !current.state) {
    merged.state = baiduData.province;
  }

  return merged;
}
|
||||
Reference in New Issue
Block a user