normalizeName strips noise words (church/parish/chapel/etc.), accents, and punctuation for robust name comparison. findMatch uses a word-overlap (Jaccard-style) score with a threshold of 0.4, plus an address-prefix fallback for Chinese-named churches where English name overlap may be low. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
439 lines
16 KiB
TypeScript
439 lines
16 KiB
TypeScript
#!/usr/bin/env tsx
|
||
/**
|
||
* Import HK Diocese parish directory from plain-text paste.
|
||
* Usage: npx tsx scripts/import-hk-parishes.ts [--dry-run] [--file scripts/hk-parishes.txt]
|
||
*/
|
||
|
||
import dotenv from 'dotenv';
|
||
import path from 'path';
|
||
import fs from 'fs';
|
||
|
||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||
|
||
import { Pool } from 'pg';
|
||
import { PrismaPg } from '@prisma/adapter-pg';
|
||
import { PrismaClient } from '@prisma/client';
|
||
|
||
// Connection string: DATABASE_URL when set (and non-empty), otherwise the local default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Log the target with the password segment (between ':' and '@') masked.
const maskedDbUrl = dbUrl.replace(/:[^:@]+@/, ':***@');
console.log(`Connecting to database: ${maskedDbUrl}`);

// Certificate verification is relaxed only when the URL contains "neon";
// every other URL leaves the ssl option undefined.
const pool = new Pool({
  connectionString: dbUrl,
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});

// Prisma client backed by the pg pool through the PrismaPg adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||
|
||
// ─── Types ────────────────────────────────────────────────────────────────────
|
||
|
||
/** One mass time on one day of the week, as produced by the schedule parsers. */
export interface ParsedSchedule {
  /** Day index: 0 = Sunday, 1 = Monday, ..., 6 = Saturday. */
  dayOfWeek: number;
  /** Time of day in 24-hour "HH:MM" form. */
  time: string;
  /** "English" | "Cantonese" | "Tagalog". */
  language: string;
  /** Free-text qualifier attached to this time, or null when there is none. */
  notes: string | null;
}
|
||
|
||
/** A single directory entry after parsing: names, contact fields and mass times. */
export interface ParsedEntry {
  /** Name of the location (church/chapel) the entry describes. */
  locationName: string;
  /** Name line found directly above the location name, or null when absent. */
  parishName: string | null;
  /** Value of the "Address" field, or null when the field is missing or empty. */
  address: string | null;
  /** Value of the "Phone" field, or null when the field is missing or empty. */
  phone: string | null;
  /** Value of the "Email" field, or null when the field is missing or empty. */
  email: string | null;
  /** Every mass time parsed from the entry's "Mass Time" section. */
  schedules: ParsedSchedule[];
}
|
||
|
||
/**
 * Shape of an already-known church that parsed entries are matched against.
 * NOTE(review): presumably a projection of a database row — confirm against the query that builds it.
 */
interface ExistingChurch {
  /** Identifier of the existing record. */
  id: string;
  /** Church name; compared against parsed location names after normalisation. */
  name: string;
  /** Stored address, if any; used by the address-based fallback match. */
  address: string | null;
  /** Stored phone number, if any. */
  phone: string | null;
  /** Stored email address, if any. */
  email: string | null;
}
|
||
|
||
/**
 * Counters describing the outcome of an import run.
 * NOTE(review): the code that updates these is outside this view; the meanings
 * below are inferred from the field names and should be confirmed.
 */
interface ImportStats {
  /** Presumably: entries matched to an existing church. */
  matched: number;
  /** Presumably: entries for which a new church was created. */
  created: number;
  /** Presumably: schedule rows written. */
  schedulesWritten: number;
  /** Presumably: entries skipped without writing. */
  skipped: number;
}
|
||
|
||
// ─── Parser ───────────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Trimmed, lower-cased line contents that are page artefacts rather than data.
 * The empty string is included so that blank lines count as artefacts too.
 */
const ARTIFACT_LINES = new Set<string>(['share', 'path', 'close', '']);

/** Case-insensitive matcher for one of the language names recognised in schedule text. */
const LANG_PATTERN = /(Cantonese|English|Tagalog|Chinese)/i;
|
||
|
||
// ─── Task 2: Entry splitter and name extractor ────────────────────────────────
|
||
|
||
/**
 * Split raw file text into individual entry strings.
 *
 * Line endings are first normalised to "\n". The text is then cut at every
 * "\nPath\nClose\n" marker, and one entry is produced per marker: the segment
 * before the marker, the marker itself, and the segment after it.
 *
 * Consequences worth knowing:
 * - text without any marker yields an empty array;
 * - the segment before the first marker (header plus first names) becomes the
 *   pre-marker part of the first entry;
 * - every middle segment appears twice: as the tail of one entry and as the
 *   pre-marker part of the next.
 *
 * @param raw Full text of the pasted directory.
 * @returns One string per marker, in file order.
 */
export function splitEntries(raw: string): string[] {
  const marker = '\nPath\nClose\n';
  const segments = raw
    .replace(/\r\n/g, '\n')
    .replace(/\r/g, '\n')
    .split(marker);

  // segments[k] precedes the k-th marker and segments[k + 1] follows it.
  return segments.slice(1).map((after, k) => segments[k] + marker + after);
}
|
||
|
||
/**
 * Extract location name and parish name from the pre-marker text of an entry.
 *
 * Lines are considered after trailing whitespace is removed. A line is ignored
 * when it is blank, when its trimmed lower-cased content is in ARTIFACT_LINES,
 * or when it begins with a space. Of the lines that remain, the last one is the
 * location name and the one before it (if any) is the parish name.
 *
 * @param preMarker Text that precedes the "Path/Close" marker.
 * @returns The names found; locationName is 'Unknown' when no line qualifies.
 */
export function extractNames(preMarker: string): { locationName: string; parishName: string | null } {
  const candidates: string[] = [];
  for (const rawLine of preMarker.split('\n')) {
    const line = rawLine.trimEnd();
    const content = line.trim();
    if (content.length === 0) continue;
    if (ARTIFACT_LINES.has(content.toLowerCase())) continue;
    if (line.startsWith(' ')) continue;
    candidates.push(line);
  }

  const [first, second] = candidates.slice(-2);
  if (first === undefined) {
    return { locationName: 'Unknown', parishName: null };
  }
  if (second === undefined) {
    return { locationName: first.trim(), parishName: null };
  }
  return { locationName: second.trim(), parishName: first.trim() };
}
|
||
|
||
// ─── Task 3: Field extractor ──────────────────────────────────────────────────
|
||
|
||
/**
 * Extract address, phone, email from the entry body (text after Path/Close).
 *
 * Each field is expected as a label line ("Address", "Phone", "Email",
 * matched case-insensitively) followed by its value on the next line(s). The
 * value runs until a blank line, the start of another known label, or the end
 * of the text; embedded newlines are collapsed to single spaces.
 *
 * Full-width parentheses (U+FF08, U+FF09) in the address and phone are
 * normalised to ASCII "(" and ")". The email is returned as found.
 *
 * @param body Entry text following the Path/Close marker.
 * @returns The three fields; a field is null when its label is absent or its value is empty.
 */
export function extractFields(body: string): { address: string | null; phone: string | null; email: string | null } {
  // The full-width characters are written as escapes on purpose: a literal
  // full-width "(" is easily flattened to ASCII by editors or copy/paste,
  // which would turn these into invalid regular expressions.
  const normalise = (s: string): string =>
    s.replace(/\uFF08/g, '(').replace(/\uFF09/g, ')').trim();

  /** Return the collapsed value that follows the given label line, or null. */
  function extractField(fieldName: string): string | null {
    const regex = new RegExp(
      `\\b${fieldName}\\n([\\s\\S]*?)(?:\\n\\n|\\nFax|\\nEmail|\\nWebsite|\\nChurch|\\nParish|\\nAssistant|\\nDeacon|\\nSister|\\nChairperson|\\nResident|\\nRector|\\nP\\.C|\\nPastoral|\\nMass Time|$)`,
      'i',
    );
    const m = body.match(regex);
    if (!m) return null;
    const value = m[1].replace(/\n/g, ' ').trim();
    return value || null;
  }

  const address = extractField('Address');
  const rawPhone = extractField('Phone');
  const email = extractField('Email');

  return {
    address: address ? normalise(address) : null,
    phone: rawPhone ? normalise(rawPhone) : null,
    email,
  };
}
|
||
|
||
// ─── Task 4: Time normalizer ──────────────────────────────────────────────────
|
||
|
||
/**
 * Normalise a time string to "HH:MM" 24-hour format.
 *
 * Accepts: "8:00am", "11:30 am", "7:00 a.m.", "12:00 noon", "noon",
 * "12 noon", etc. Matching is case-insensitive and the time may be embedded
 * in surrounding text.
 *
 * @param raw Text expected to contain a clock time.
 * @returns The time as "HH:MM", or null when no time is found or when the
 *          digits do not form a real time of day (hour above 23 after
 *          conversion, or minutes above 59).
 */
export function normalizeTime(raw: string): string | null {
  const s = raw.trim().toLowerCase();

  // Format a clock value, refusing anything that is not a real time of day so
  // that callers never receive strings such as "25:00" or "09:75".
  const toClock = (hour: number, minute: number): string | null =>
    hour > 23 || minute > 59
      ? null
      : `${String(hour).padStart(2, '0')}:${String(minute).padStart(2, '0')}`;

  if (s.includes('noon')) {
    // Bare "noon" and the common "12 noon" carry no minutes.
    if (/^(?:12\s*)?noon$/.test(s)) return '12:00';
    // "12:00 noon": the digits are taken as written, without am/pm conversion.
    const noon = s.match(/(\d{1,2}):(\d{2})\s*noon/);
    if (noon) return toClock(parseInt(noon[1], 10), parseInt(noon[2], 10));
    // Otherwise (e.g. "afternoon") fall through to the am/pm form.
  }

  const m = s.match(/(\d{1,2}):(\d{2})\s*(am|pm|a\.m\.|p\.m\.)/);
  if (!m) return null;

  let hour = parseInt(m[1], 10);
  const minute = parseInt(m[2], 10);
  const period = m[3].replace(/\./g, '');

  if (period === 'am') {
    // 12:xx am is just after midnight.
    if (hour === 12) hour = 0;
  } else if (hour !== 12) {
    // 12:xx pm stays 12; every other pm hour moves to the afternoon.
    hour += 12;
  }

  return toClock(hour, minute);
}
|
||
|
||
// ─── Task 5: Schedule line parser ────────────────────────────────────────────
|
||
|
||
/**
 * Leading "condition" of a schedule line: some words, then a weekday name or
 * the word "month", then anything up to and including the first colon.
 */
const CONDITIONAL_PATTERN = /^([\w\s]+(?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|month)[^:]*:)\s*/i;

/**
 * Parse a single schedule text line into 0-N ParsedSchedule records.
 *
 * Steps, in order:
 * 1. The first language name on the line (optionally parenthesised) selects
 *    the language and is removed; "Chinese" counts as Cantonese, and the
 *    language is English when none is named.
 * 2. Every standalone "Saturday" and "on" is removed.
 * 3. A leading conditional prefix (see CONDITIONAL_PATTERN) is removed and
 *    kept, without its colon, as the default note for the line.
 * 4. What is left is split on commas; each piece containing a recognisable
 *    time yields one record. Text in the piece other than the time becomes
 *    that record's note, otherwise the default note applies.
 *
 * @param line One line from a Sunday or Anticipated-Sunday section.
 * @param dayOfWeek Day assigned to every record (0=Sun, 6=Sat for Anticipated).
 * @returns The records found, in the order the times appear.
 */
export function parseScheduleLine(line: string, dayOfWeek: number): ParsedSchedule[] {
  let text = line.trim();

  // Language tag, with or without surrounding parentheses.
  let language = 'English';
  const tag = text.match(/\(?(Cantonese|English|Tagalog|Chinese)\)?/i);
  if (tag) {
    switch (tag[1].toLowerCase()) {
      case 'cantonese':
      case 'chinese':
        language = 'Cantonese';
        break;
      case 'tagalog':
        language = 'Tagalog';
        break;
      default:
        language = 'English';
    }
    text = text.replace(tag[0], '').trim();
  }

  // "Saturday" / "on Saturday" anchors add nothing once dayOfWeek is fixed.
  text = text.replace(/\bSaturday\b/gi, '').replace(/\bon\b/gi, '').trim();

  // Conditional prefix becomes the fallback note for every time on the line.
  let sectionNotes: string | null = null;
  const conditional = text.match(CONDITIONAL_PATTERN);
  if (conditional) {
    sectionNotes = conditional[1].replace(/:$/, '').trim();
    text = text.slice(conditional[0].length);
  }

  return text
    .split(',')
    .map(piece => piece.trim())
    .filter(piece => piece.length > 0)
    .flatMap(piece => {
      const time = normalizeTime(piece);
      if (!time) return [];

      // Whatever remains once the time itself is removed is a per-time note.
      const leftover = piece
        .replace(/\d{1,2}:\d{2}\s*(am|pm|a\.m\.|p\.m\.|noon)/i, '')
        .replace(/\s+/g, ' ')
        .trim();

      return [{ dayOfWeek, time, language, notes: leftover || sectionNotes }];
    });
}
|
||
|
||
// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────
|
||
|
||
/** Lower-case day abbreviations (no trailing period) mapped to 0=Sun ... 6=Sat. */
const DAY_ABBREV: Record<string, number> = {
  mon: 1, tue: 2, tues: 2, wed: 3, weds: 3, thur: 4, thurs: 4, thu: 4, fri: 5, sat: 6, sun: 0,
};

/** Lower-case full day names mapped to 0=Sun ... 6=Sat. */
const DAY_FULL: Record<string, number> = {
  monday: 1, tuesday: 2, wednesday: 3, thursday: 4, friday: 5, saturday: 6, sunday: 0,
};

/**
 * Turn a day expression into the list of day indices it denotes.
 *
 * Periods and colons are ignored and matching is case-insensitive. Two forms
 * are understood:
 * - a range, "monday to friday" / "mon to sat": every day from the first to
 *   the last inclusive, wrapping past Saturday to Sunday when needed;
 * - a list separated by commas, ampersands or spaces, "mon, tue, thur" /
 *   "tue & sat": the named days in order of appearance, without duplicates.
 *
 * A day may be written as its full name, a listed abbreviation, or any prefix
 * of the full name at least three letters long ("Tues", "Thurs", "Satur").
 * Words that are not days are skipped.
 *
 * @param prefix Text naming one or more days.
 * @returns Day indices (0=Sun ... 6=Sat); empty when no day is recognised.
 */
function parseDays(prefix: string): number[] {
  // Own-property checks keep inherited keys such as "constructor" from being
  // mistaken for a day.
  const lookup = (token: string): number | undefined => {
    for (const table of [DAY_FULL, DAY_ABBREV]) {
      if (Object.prototype.hasOwnProperty.call(table, token)) return table[token];
    }
    // Three letters are enough to identify a day unambiguously.
    if (token.length >= 3) {
      for (const [fullName, day] of Object.entries(DAY_FULL)) {
        if (fullName.startsWith(token)) return day;
      }
    }
    return undefined;
  };

  const s = prefix.toLowerCase().replace(/\./g, '').replace(/:/g, '').trim();

  // Range: "monday to friday" or "mon to sat"
  const rangeMatch = s.match(/(\w+)\s+to\s+(\w+)/);
  if (rangeMatch) {
    const fromDay = lookup(rangeMatch[1]);
    const toDay = lookup(rangeMatch[2]);
    if (fromDay !== undefined && toDay !== undefined) {
      const days: number[] = [];
      for (let d = fromDay; d !== toDay; d = (d + 1) % 7) days.push(d);
      days.push(toDay);
      return days;
    }
  }

  // List: "mon, tue, thur" or "tue & sat"
  const days = s
    .split(/[,&\s]+/)
    .map(token => lookup(token))
    .filter((d): d is number => d !== undefined);
  return [...new Set(days)];
}
|
||
|
||
// Day prefix at the start of a token: one or more day names (abbreviated or
// full, optionally joined by "," / "&" or forming a "to" range), followed by
// at least one space or colon. Capture group 1 holds the day expression.
const DAY_PREFIX_RE = /^((?:(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*(?:[,&]\s*(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*)*(?:to\s+\w+\.?\s*)?)|(?:(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+to\s+\w+)?))[\s:]+/i;

// A token that consists of a single day word and nothing else — e.g. "Mon." "Tue."
const PURE_DAY_RE = /^((?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?|(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))\.?$/i;

/**
 * Parse a weekday mass line that may have day prefixes.
 *
 * The first language name on the line (optionally parenthesised) selects the
 * language and is removed; "Chinese" counts as Cantonese and the default is
 * English. The rest is split on commas and read left to right:
 *
 * - a token holding only days (no time) adds those days to a pending list;
 * - a token holding a time emits one record per active day. If days are
 *   pending, the active days become the pending days plus the token's own
 *   days; otherwise the token's own days, when it names any, become active;
 *   otherwise the previously active days stay in force;
 * - a token with neither is ignored.
 *
 * The active days start as Monday to Friday. Notes are always null.
 *
 * @param line One line from a "Weekday Masses" section.
 * @returns One record per (day, time) pair, in reading order.
 */
export function parseWeekdayLine(line: string): ParsedSchedule[] {
  let text = line.trim();
  let language = 'English';

  const tag = text.match(/\(?(Cantonese|English|Tagalog|Chinese)\)?/i);
  if (tag) {
    const tagName = tag[1].toLowerCase();
    if (tagName === 'cantonese' || tagName === 'chinese') {
      language = 'Cantonese';
    } else if (tagName === 'tagalog') {
      language = 'Tagalog';
    } else {
      language = 'English';
    }
    // Also drop an opening parenthesis left dangling at the end of the line.
    text = text.replace(tag[0], '').replace(/\s*\(\s*$/, '').trim();
  }

  const schedules: ParsedSchedule[] = [];
  let activeDays: number[] = [1, 2, 3, 4, 5]; // default Mon–Fri
  let pendingDays: number[] = []; // days seen without a time, waiting for one

  const pieces = text.split(',').map(piece => piece.trim()).filter(piece => piece.length > 0);
  for (const piece of pieces) {
    let ownDays: number[] = [];
    let timeText = piece;

    const prefix = piece.match(DAY_PREFIX_RE);
    if (prefix) {
      ownDays = parseDays(prefix[1]);
      timeText = piece.slice(prefix[0].length);
    } else if (PURE_DAY_RE.test(piece)) {
      // Pure day token like "Mon." "Tue." "Tuesday": remember it for the next time.
      pendingDays.push(...parseDays(piece));
      continue;
    }

    const time = normalizeTime(timeText);
    if (!time) {
      // Days without a time are carried forward; a token with neither adds nothing.
      pendingDays.push(...ownDays);
      continue;
    }

    if (pendingDays.length > 0) {
      activeDays = [...new Set([...pendingDays, ...ownDays])];
      pendingDays = [];
    } else if (ownDays.length > 0) {
      activeDays = ownDays;
    }

    for (const day of activeDays) {
      schedules.push({ dayOfWeek: day, time, language, notes: null });
    }
  }

  return schedules;
}
|
||
|
||
// ─── Task 7: Full entry parser ────────────────────────────────────────────────
|
||
|
||
/** Lower-cased section headings whose lines are not turned into schedules. */
const SKIP_SECTIONS = new Set(['special masses', 'eucharist adoration']);

/**
 * Parse a full raw entry string (including pre-marker names) into a ParsedEntry.
 *
 * The entry is divided at the first "\nPath\nClose\n" marker: names come from
 * the text before it, contact fields and mass times from the text after it.
 * When the marker is absent the whole string is treated as the body.
 *
 * Mass times are read from the text between "Mass Time" and the next "Share"
 * line (or the end of the text). Within it, the headings "Sunday Masses",
 * "Anticipated Sunday Masses" and "Weekday Masses" (case-insensitive, whole
 * line) choose how the following lines are parsed; headings listed in
 * SKIP_SECTIONS switch parsing off; lines before any heading are ignored.
 *
 * @param raw One entry as produced by splitEntries.
 * @returns The parsed entry.
 */
export function parseEntry(raw: string): ParsedEntry {
  const marker = '\nPath\nClose\n';
  const cut = raw.indexOf(marker);
  const hasMarker = cut >= 0;
  const pre = hasMarker ? raw.slice(0, cut) : '';
  const body = hasMarker ? raw.slice(cut + marker.length) : raw;

  const names = extractNames(pre);
  const fields = extractFields(body);

  const schedules: ParsedSchedule[] = [];
  const massBlock = body.match(/Mass Time\n([\s\S]*?)(?:Share\n|$)/i);
  const massLines = massBlock ? massBlock[1].split('\n') : [];

  let section: 'sunday' | 'anticipated' | 'weekday' | 'skip' | null = null;
  for (const rawLine of massLines) {
    const text = rawLine.trim();
    if (text === '') continue;

    const heading = text.toLowerCase();
    if (heading === 'sunday masses') {
      section = 'sunday';
    } else if (heading === 'anticipated sunday masses') {
      section = 'anticipated';
    } else if (heading === 'weekday masses') {
      section = 'weekday';
    } else if (SKIP_SECTIONS.has(heading)) {
      section = 'skip';
    } else {
      switch (section) {
        case 'sunday':
          schedules.push(...parseScheduleLine(text, 0));
          break;
        case 'anticipated':
          // Anticipated Sunday masses are celebrated on Saturday.
          schedules.push(...parseScheduleLine(text, 6));
          break;
        case 'weekday':
          schedules.push(...parseWeekdayLine(text));
          break;
        default:
          // Skipped section, or no heading seen yet.
          break;
      }
    }
  }

  return {
    locationName: names.locationName,
    parishName: names.parishName,
    address: fields.address,
    phone: fields.phone,
    email: fields.email,
    schedules,
  };
}
|
||
|
||
// ─── Task 8: Name normalizer + matcher ───────────────────────────────────────
|
||
|
||
/** Words that carry no identifying information and are dropped before comparing names. */
const NOISE_WORDS = new Set([
  'church', 'parish', 'chapel', 'centre', 'center', 'mass',
  'saint', 'st', 'our', 'lady', 'of', 'the', 'a', 'an',
]);

/**
 * Normalise a church name for comparison.
 *
 * The name is lower-cased, accents are removed, every character other than
 * a-z, 0-9 and whitespace is replaced by a space, and the result is split
 * into words. Words shorter than two characters and words in NOISE_WORDS are
 * discarded; the rest are joined with single spaces.
 *
 * A name made up entirely of non-Latin characters therefore normalises to ''.
 *
 * @param name Church name in any casing.
 * @returns The normalised, space-separated word list (possibly empty).
 */
export function normalizeName(name: string): string {
  // Decompose accented letters, then drop the combining marks.
  const unaccented = name
    .toLowerCase()
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '');

  const words = unaccented.replace(/[^a-z0-9\s]/g, ' ').split(/\s+/);

  const kept: string[] = [];
  for (const word of words) {
    if (word.length < 2 || NOISE_WORDS.has(word)) continue;
    kept.push(word);
  }
  return kept.join(' ').trim();
}
|
||
|
||
/**
 * Score how many distinct words two space-separated strings share.
 *
 * The score is the number of distinct words present in both strings divided
 * by the larger of the two distinct-word counts, so it lies between 0 and 1.
 * It is 0 when either string has no words.
 *
 * @param a First string, words separated by single spaces.
 * @param b Second string, words separated by single spaces.
 * @returns Overlap score in [0, 1].
 */
function wordOverlap(a: string, b: string): number {
  const wordsA = new Set(a.split(' ').filter(word => word !== ''));
  const wordsB = new Set(b.split(' ').filter(word => word !== ''));
  if (wordsA.size === 0 || wordsB.size === 0) return 0;

  const shared = [...wordsA].filter(word => wordsB.has(word)).length;
  return shared / Math.max(wordsA.size, wordsB.size);
}
|
||
|
||
/**
 * Find the best-matching existing church for a parsed entry.
 *
 * Name match: both names are normalised and scored with wordOverlap. The
 * highest-scoring church wins (the earliest one on a tie) provided its score
 * is at least 0.4.
 *
 * Address fallback: used only when the name match fails and the parsed
 * address has at least 5 characters. The first 12 characters of the address,
 * lower-cased, are searched for anywhere inside each existing address
 * (lower-cased); the first church that contains them is returned.
 *
 * @param locationName Parsed location name.
 * @param address Parsed address, or null when the entry has none.
 * @param existing Churches already known.
 * @returns The matched church, or null if no match meets the threshold.
 */
export function findMatch(
  locationName: string,
  address: string | null,
  existing: ExistingChurch[]
): ExistingChurch | null {
  const target = normalizeName(locationName);

  let top: { church: ExistingChurch; score: number } | null = null;
  for (const candidate of existing) {
    const score = wordOverlap(target, normalizeName(candidate.name));
    // Strictly greater: a zero score never qualifies and ties keep the earlier church.
    if (score > (top?.score ?? 0)) {
      top = { church: candidate, score };
    }
  }
  if (top !== null && top.score >= 0.4) return top.church;

  // Fallback: look for the start of the parsed address inside existing addresses.
  if (!address || address.length < 5) return null;
  const needle = address.slice(0, 12).toLowerCase();
  return existing.find(candidate => candidate.address?.toLowerCase().includes(needle)) ?? null;
}
|