Files
ScraperControl/scripts/import-hk-parishes.ts
albertfj114 92265cf27f feat: add DB operations and CLI wiring for HK parish import
upsertChurch() handles matched churches (replace schedules atomically
via $transaction, update contact fields if null) and new churches
(create with source='diocese-hk', lat/lng=0 for later geocoding).
main() wires up CLI args, file reading, matching loop, and summary.
Guards main() call with ESM import.meta.url check to prevent execution
on import during tests.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-03 16:27:02 -04:00

585 lines
20 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env tsx
/**
* Import HK Diocese parish directory from plain-text paste.
* Usage: npx tsx scripts/import-hk-parishes.ts [--dry-run] [--file scripts/hk-parishes.txt]
*/
import dotenv from 'dotenv';
import path from 'path';
import fs from 'fs';
// Load environment files from the current working directory: `.env.local` first, then `.env`.
// NOTE(review): dotenv presumably leaves already-defined variables untouched by default,
// which would make `.env.local` values win over `.env` — confirm for the installed version.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL when set, otherwise a local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the segment between ':' and '@' replaced by '***'.
// NOTE(review): the mask pattern excludes ':' and '@' inside the secret, so a password
// containing either character would only be partially masked — confirm this is acceptable.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
// Shared pg pool. TLS is configured (with certificate verification disabled) only when
// the URL contains the substring 'neon'; otherwise the `ssl` option is left undefined.
const pool = new Pool({
connectionString: dbUrl,
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma client running on top of the pg pool through the driver adapter.
// These statements run at module load, i.e. also when the file is merely imported.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// ─── Types ────────────────────────────────────────────────────────────────────
/** One mass time produced by the schedule parsers in this file. */
export interface ParsedSchedule {
/** Day the mass is filed under. */
dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
/** 24-hour clock time, as returned by normalizeTime(). */
time: string; // "HH:MM"
/** Language label; the parsers fold a "Chinese" tag into "Cantonese" and default to "English". */
language: string; // "English" | "Cantonese" | "Tagalog"
/** Free-text qualifier attached to the time (e.g. a conditional prefix), or null when there is none. */
notes: string | null;
}
/** Result of parsing one directory entry with parseEntry(). */
export interface ParsedEntry {
/** Last name line found before the Path/Close marker; 'Unknown' when none was found. */
locationName: string;
/** Name line immediately above the location name, or null when only one name line exists. */
parishName: string | null;
/** Value of the "Address" field with line breaks collapsed to spaces, or null. */
address: string | null;
/** Value of the "Phone" field, or null. */
phone: string | null;
/** Value of the "Email" field, or null. */
email: string | null;
/** Every mass time parsed from the entry's "Mass Time" section (may be empty). */
schedules: ParsedSchedule[];
}
/**
 * Subset of a stored church row used for matching and for deciding which contact
 * fields to fill in; mirrors the columns selected by main().
 */
interface ExistingChurch {
id: string;
name: string;
address: string | null;
phone: string | null;
email: string | null;
}
/** Running counters printed in the import summary. */
interface ImportStats {
/** Entries that were matched to an existing church. */
matched: number;
/** Entries that resulted (or, in a dry run, would result) in a new church. */
created: number;
/** Total number of parsed schedules across processed entries (counted in dry runs too). */
schedulesWritten: number;
/** Entries dropped because parsing threw or no usable location name was found. */
skipped: number;
}
// ─── Parser ───────────────────────────────────────────────────────────────────
// Lower-cased line contents that extractNames() ignores when looking for name lines:
// the "Share", "Path" and "Close" page artifacts, plus the empty string.
const ARTIFACT_LINES = new Set(['share', 'path', 'close', '']);
// Case-insensitive matcher for the recognised language labels.
// NOTE(review): no code visible in this file references LANG_PATTERN (the schedule
// parsers use their own inline pattern) — confirm it is still needed.
const LANG_PATTERN = /(Cantonese|English|Tagalog|Chinese)/i;
// ─── Task 2: Entry splitter and name extractor ────────────────────────────────
/**
 * Cut the raw paste into one string per "Path\nClose" marker.
 *
 * Line endings are first normalised to "\n". The text is then split on the
 * marker, and every marker yields one entry made of the segment before it,
 * the marker itself, and the segment after it. Consequently:
 *  - the lead-in of an entry ends with that entry's name lines (the first
 *    entry's lead-in also carries the file header);
 *  - the tail of an entry ends with the name lines of the following entry.
 *
 * @param raw Full file contents.
 * @returns One string per marker occurrence; an empty array when the marker never occurs.
 */
export function splitEntries(raw: string): string[] {
  const marker = '\nPath\nClose\n';
  const segments = raw.replace(/\r\n?/g, '\n').split(marker);
  // Pair each segment with its predecessor: segments[idx] precedes segments[idx + 1].
  return segments.slice(1).map((after, idx) => segments[idx] + marker + after);
}
/**
 * Pick the location and parish names out of the text that precedes the
 * Path/Close marker.
 *
 * Lines that are blank, that are listed in ARTIFACT_LINES (compared
 * lower-cased and trimmed), or that begin with a space are ignored. Of the
 * remaining lines, the last one is the location name and the one before it
 * (if any) is the parish name.
 *
 * @param preMarker Text before the marker of one entry.
 * @returns Trimmed names; `locationName` is 'Unknown' when no candidate line exists.
 */
export function extractNames(preMarker: string): { locationName: string; parishName: string | null } {
  const candidates: string[] = [];
  for (const rawLine of preMarker.split('\n')) {
    const line = rawLine.trimEnd();
    // Lines starting with a space are never treated as names.
    if (line.startsWith(' ')) continue;
    const text = line.trim();
    if (text.length === 0 || ARTIFACT_LINES.has(text.toLowerCase())) continue;
    candidates.push(text);
  }

  const location = candidates.pop();
  if (location === undefined) {
    return { locationName: 'Unknown', parishName: null };
  }
  const parish = candidates.pop();
  return { locationName: location, parishName: parish ?? null };
}
// ─── Task 3: Field extractor ──────────────────────────────────────────────────
/**
 * Extract address, phone and email from the entry body (text after Path/Close).
 *
 * Each field is expected as a label line ("Address", "Phone", "Email",
 * matched case-insensitively) followed by its value. The value runs until a
 * blank line, one of the known following labels, or the end of the body;
 * line breaks inside the value are collapsed to single spaces.
 *
 * Full-width parentheses (U+FF08 / U+FF09) in the address and phone are
 * normalised to ASCII "(" and ")". The email is returned as captured.
 *
 * @param body Entry text after the Path/Close marker.
 * @returns The three fields; a field that is absent or empty is null.
 */
export function extractFields(body: string): { address: string | null; phone: string | null; email: string | null } {
  // Written as escapes so the patterns cannot be lost by encoding-unaware tooling.
  const normalise = (s: string): string =>
    s.replace(/\uFF08/g, '(').replace(/\uFF09/g, ')').trim();

  // Anything that ends a field value: a blank line, the next known label, or end of input.
  const terminators = [
    '\\n\\n', '\\nFax', '\\nEmail', '\\nWebsite', '\\nChurch', '\\nParish',
    '\\nAssistant', '\\nDeacon', '\\nSister', '\\nChairperson', '\\nResident',
    '\\nRector', '\\nP\\.C', '\\nPastoral', '\\nMass Time', '$',
  ].join('|');

  function extractField(fieldName: string): string | null {
    const regex = new RegExp(`\\b${fieldName}\\n([\\s\\S]*?)(?:${terminators})`, 'i');
    const m = body.match(regex);
    if (!m) return null;
    const value = m[1].replace(/\n/g, ' ').trim();
    return value || null;
  }

  const address = extractField('Address');
  const rawPhone = extractField('Phone');
  const email = extractField('Email');
  return {
    address: address ? normalise(address) : null,
    phone: rawPhone ? normalise(rawPhone) : null,
    email: email || null,
  };
}
// ─── Task 4: Time normalizer ──────────────────────────────────────────────────
/**
 * Normalise a time string to "HH:MM" 24-hour format.
 *
 * Accepted shapes (case-insensitive, surrounding text allowed):
 *  - "8:00am", "11:30 am", "7:00 a.m.", "7:00 p.m."
 *  - "12:00 noon" (the digits are kept as written) and the bare word "noon"
 *
 * Values that cannot be a real clock time are rejected rather than converted:
 * with am/pm the hour must be 0-12, with "noon" 0-23, and minutes must be
 * 0-59. Without this check "25:00pm" would become "37:00".
 *
 * @param raw Text containing a time.
 * @returns "HH:MM", or null if no valid time is found.
 */
export function normalizeTime(raw: string): string | null {
  const s = raw.trim().toLowerCase();
  const format = (hour: number, minute: number): string =>
    `${String(hour).padStart(2, '0')}:${String(minute).padStart(2, '0')}`;

  if (s.includes('noon')) {
    if (s === 'noon') return '12:00';
    const noon = s.match(/(\d{1,2}):(\d{2})\s*noon/);
    if (noon) {
      const hour = parseInt(noon[1], 10);
      const minute = parseInt(noon[2], 10);
      return hour <= 23 && minute <= 59 ? format(hour, minute) : null;
    }
    // "noon" appears without adjacent digits (e.g. "12:00pm (noon)"): fall through to am/pm.
  }

  const m = s.match(/(\d{1,2}):(\d{2})\s*(am|pm|a\.m\.|p\.m\.)/);
  if (!m) return null;
  let h = parseInt(m[1], 10);
  const min = parseInt(m[2], 10);
  // A 12-hour clock reading cannot exceed 12:59.
  if (h > 12 || min > 59) return null;

  const period = m[3].replace(/\./g, '');
  if (period === 'am') {
    if (h === 12) h = 0; // 12:xx am is just after midnight
  } else if (h !== 12) {
    h += 12; // 12:xx pm stays 12
  }
  return format(h, min);
}
// ─── Task 5: Schedule line parser ────────────────────────────────────────────
// Matches a leading conditional clause that mentions a weekday name or "month" and ends
// with ':' (for example "1st Sunday of month:"), plus any whitespace after the colon.
// Capture group 1 is the clause including its trailing colon.
// NOTE(review): the clause extends to the first ':' after the weekday/"month" word, which
// could be the colon inside a time (e.g. "Every Sunday 9:00am") when the line has no
// explicit "label:" prefix — confirm the input never has that shape.
const CONDITIONAL_PATTERN = /^([\w\s]+(?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|month)[^:]*:)\s*/i;
/**
 * Turn one text line of a fixed-day section into schedule records.
 *
 * Processing order:
 *  1. The first language tag on the line (optionally wrapped in parentheses)
 *     selects the language and is removed; "Chinese" is reported as
 *     "Cantonese"; without a tag the language is "English".
 *  2. Every standalone word "Saturday" and every standalone word "on" is
 *     removed from the line.
 *  3. A leading conditional clause (see CONDITIONAL_PATTERN) is removed and
 *     kept, without its colon, as the default note for the whole line.
 *  4. The rest is split on commas. Pieces without a recognisable time are
 *     dropped. Whatever text remains in a piece after removing the time
 *     becomes that record's note; otherwise the conditional clause is used.
 *
 * @param line Raw schedule line.
 * @param dayOfWeek Day assigned to every record (the caller passes 0 for
 *   Sunday sections and 6 for Anticipated Sunday sections).
 * @returns Zero or more records, in the order the times appear.
 */
export function parseScheduleLine(line: string, dayOfWeek: number): ParsedSchedule[] {
  let text = line.trim();

  // Step 1: language tag.
  let language = 'English';
  const tag = text.match(/\(?(Cantonese|English|Tagalog|Chinese)\)?/i);
  if (tag) {
    switch (tag[1].toLowerCase()) {
      case 'cantonese':
      case 'chinese':
        language = 'Cantonese';
        break;
      case 'tagalog':
        language = 'Tagalog';
        break;
      default:
        language = 'English';
    }
    text = text.replace(tag[0], '').trim();
  }

  // Step 2: drop the "Saturday" / "on Saturday" anchors.
  text = text.replace(/\bSaturday\b/gi, '').replace(/\bon\b/gi, '').trim();

  // Step 3: conditional clause shared by all times on the line.
  let sectionNotes: string | null = null;
  const clause = text.match(CONDITIONAL_PATTERN);
  if (clause) {
    sectionNotes = clause[1].replace(/:$/, '').trim();
    text = text.slice(clause[0].length);
  }

  // Step 4: one record per comma-separated piece that contains a time.
  return text
    .split(',')
    .map(piece => piece.trim())
    .filter(Boolean)
    .flatMap((piece): ParsedSchedule[] => {
      const time = normalizeTime(piece);
      if (!time) return [];
      const leftover = piece
        .replace(/\d{1,2}:\d{2}\s*(am|pm|a\.m\.|p\.m\.|noon)/i, '')
        .replace(/\s+/g, ' ')
        .trim();
      return [{ dayOfWeek, time, language, notes: leftover || sectionNotes }];
    });
}
// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────
// Abbreviated day names -> day number (0=Sun ... 6=Sat). Includes the longer
// abbreviations ("tues", "weds", "thurs") that the day-prefix patterns in this
// file also accept, so they resolve to a day instead of being ignored.
const DAY_ABBREV: Record<string, number> = {
  mon: 1, tue: 2, tues: 2, wed: 3, weds: 3, thur: 4, thurs: 4, thu: 4, fri: 5, sat: 6, sun: 0,
};
// Full day names -> day number (0=Sun ... 6=Sat).
const DAY_FULL: Record<string, number> = {
  monday: 1, tuesday: 2, wednesday: 3, thursday: 4, friday: 5, saturday: 6, sunday: 0,
};

/**
 * Resolve a single lower-cased word to a day number.
 * Only real table entries count; inherited object members (e.g. "constructor") do not.
 */
function lookupDay(token: string): number | undefined {
  const value: unknown = DAY_FULL[token] ?? DAY_ABBREV[token];
  return typeof value === 'number' ? value : undefined;
}

/**
 * Convert a day prefix such as "Mon to Fri", "Tue & Sat" or "Mon., Thurs.:"
 * into day numbers (0=Sun ... 6=Sat).
 *
 * Dots and colons are ignored and matching is case-insensitive. A range
 * ("X to Y") is expanded inclusively and wraps past Saturday ("sat to mon"
 * gives 6, 0, 1). Otherwise the prefix is read as a list separated by commas,
 * ampersands or whitespace; unknown words are skipped and duplicates removed.
 *
 * @param prefix Text holding day names.
 * @returns Day numbers in order of appearance; empty when nothing is recognised.
 */
function parseDays(prefix: string): number[] {
  const s = prefix.toLowerCase().replace(/\./g, '').replace(/:/g, '').trim();

  // Range: "monday to friday" or "mon to sat"
  const rangeMatch = s.match(/(\w+)\s+to\s+(\w+)/);
  if (rangeMatch) {
    const fromDay = lookupDay(rangeMatch[1]);
    const toDay = lookupDay(rangeMatch[2]);
    if (fromDay !== undefined && toDay !== undefined) {
      const days: number[] = [];
      let d = fromDay;
      while (d !== toDay) {
        days.push(d);
        d = (d + 1) % 7; // wrap Saturday -> Sunday
      }
      days.push(toDay);
      return days;
    }
  }

  // List: "mon, tue, thur" or "tue & sat"
  const days = s
    .split(/[,&\s]+/)
    .filter(Boolean)
    .map(lookupDay)
    .filter((d): d is number => d !== undefined);
  return [...new Set(days)];
}
// Matches a day prefix at the start of a token; at least one trailing space or colon is
// required. Capture group 1 is the prefix itself: either abbreviated days (optionally
// joined by ',' or '&', optionally followed by "to <word>") or one full day name
// (optionally followed by "to <word>"). The `\w*` after each abbreviation also lets
// longer spellings through (e.g. "Thurs."); whether those resolve to a day is decided
// by parseDays().
const DAY_PREFIX_RE = /^((?:(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*(?:[,&]\s*(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*)*(?:to\s+\w+\.?\s*)?)|(?:(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+to\s+\w+)?))[\s:]+/i;
// Matches a token that consists of a single day word and nothing else (no time),
// optionally ending in a dot — e.g. "Mon." or "Tuesday".
const PURE_DAY_RE = /^((?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?|(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))\.?$/i;
/**
 * Parse one line of a "Weekday Masses" section, which may carry day prefixes.
 *
 * The first language tag on the line selects the language ("Chinese" is reported as
 * "Cantonese", default "English") and is removed. The rest is split on commas and each
 * token is handled in order while tracking the days that currently apply:
 *  - token with a day prefix and a time: emits that time for the prefix's days, merged
 *    with any day-only tokens seen just before it;
 *  - token that is only a day (e.g. "Mon."): remembered until the next time appears;
 *  - token that is only a time: emitted for the remembered days if any, otherwise for the
 *    days used by the previous time.
 * Until a day is mentioned, times apply to Monday through Friday.
 *
 * @param line Raw schedule line.
 * @returns One record per (day, time) pair; `notes` is always null.
 */
export function parseWeekdayLine(line: string): ParsedSchedule[] {
let remainder = line.trim();
let language = 'English';
// Language tag, with or without surrounding parentheses.
const langMatch = remainder.match(/\(?(Cantonese|English|Tagalog|Chinese)\)?/i);
if (langMatch) {
const raw = langMatch[1].toLowerCase();
language = raw === 'cantonese' || raw === 'chinese' ? 'Cantonese'
: raw === 'tagalog' ? 'Tagalog' : 'English';
// Remove the tag, then any "(" left dangling at the end of the line.
remainder = remainder.replace(langMatch[0], '').replace(/\s*\(\s*$/, '').trim();
}
const results: ParsedSchedule[] = [];
const tokens = remainder.split(',').map(t => t.trim()).filter(Boolean);
let currentDays: number[] = [1, 2, 3, 4, 5]; // default Mon-Fri
let accumulatedDays: number[] = []; // day-only tokens accumulate here until a time appears
for (const token of tokens) {
const prefixMatch = token.match(DAY_PREFIX_RE);
if (prefixMatch) {
const days = parseDays(prefixMatch[1]);
// Everything after the day prefix is expected to hold the time.
const timePart = token.slice(prefixMatch[0].length);
const time = normalizeTime(timePart);
if (time) {
// Merge any previously accumulated day-only tokens with this token's days;
// if the prefix resolved to no day at all, keep using the current days.
const mergedDays = accumulatedDays.length > 0
? [...new Set([...accumulatedDays, ...days])]
: days.length > 0 ? days : currentDays;
accumulatedDays = [];
if (mergedDays.length > 0) currentDays = mergedDays;
for (const day of currentDays) results.push({ dayOfWeek: day, time, language, notes: null });
} else {
// Day-only token via prefix match: accumulate
if (days.length > 0) accumulatedDays.push(...days);
}
} else if (PURE_DAY_RE.test(token)) {
// Pure day token like "Mon." "Tue." "Tuesday" — accumulate
const days = parseDays(token);
if (days.length > 0) accumulatedDays.push(...days);
} else {
// No day information in this token: treat it as a bare time (ignored if no time is found).
const time = normalizeTime(token);
if (time) {
// Apply any accumulated days, then reset
if (accumulatedDays.length > 0) {
currentDays = [...new Set(accumulatedDays)];
accumulatedDays = [];
}
for (const day of currentDays) results.push({ dayOfWeek: day, time, language, notes: null });
}
}
}
return results;
}
// ─── Task 7: Full entry parser ────────────────────────────────────────────────
// Section headings (lower-cased) whose lines are ignored entirely.
const SKIP_SECTIONS = new Set(['special masses', 'eucharist adoration']);

/**
 * Parse one raw entry string (lead-in names + marker + body) into a ParsedEntry.
 *
 * The text before the first "Path\nClose" marker supplies the names; the text
 * after it supplies the contact fields and the schedules. When the marker is
 * missing, the whole string is treated as body and no names are available.
 *
 * Schedules are read from the "Mass Time" block, which ends at the next
 * "Share" line or at the end of the text. Inside it, heading lines switch the
 * active section:
 *  - "Sunday Masses"             -> lines parsed for day 0
 *  - "Anticipated Sunday Masses" -> lines parsed for day 6
 *  - "Weekday Masses"            -> lines parsed with day prefixes
 *  - headings in SKIP_SECTIONS   -> lines ignored
 * Lines seen before any heading are ignored as well.
 *
 * @param raw One entry as produced by splitEntries().
 * @returns The parsed entry; `schedules` is empty when no "Mass Time" block exists.
 */
export function parseEntry(raw: string): ParsedEntry {
  type Section = 'sunday' | 'anticipated' | 'weekday' | 'skip';

  const marker = '\nPath\nClose\n';
  const markerAt = raw.indexOf(marker);
  const hasMarker = markerAt >= 0;
  const pre = hasMarker ? raw.slice(0, markerAt) : '';
  const body = hasMarker ? raw.slice(markerAt + marker.length) : raw;

  const names = extractNames(pre);
  const fields = extractFields(body);

  const headings = new Map<string, Section>([
    ['sunday masses', 'sunday'],
    ['anticipated sunday masses', 'anticipated'],
    ['weekday masses', 'weekday'],
  ]);

  const schedules: ParsedSchedule[] = [];
  const massBlock = /Mass Time\n([\s\S]*?)(?:Share\n|$)/i.exec(body);
  let section: Section | null = null;

  for (const rawLine of massBlock ? massBlock[1].split('\n') : []) {
    const text = rawLine.trim();
    if (text === '') continue;

    const key = text.toLowerCase();
    const heading = headings.get(key);
    if (heading !== undefined) {
      section = heading;
      continue;
    }
    if (SKIP_SECTIONS.has(key)) {
      section = 'skip';
      continue;
    }

    switch (section) {
      case 'sunday':
        schedules.push(...parseScheduleLine(text, 0));
        break;
      case 'anticipated':
        schedules.push(...parseScheduleLine(text, 6));
        break;
      case 'weekday':
        schedules.push(...parseWeekdayLine(text));
        break;
      default:
        // Skipped section, or content before the first heading.
        break;
    }
  }

  return {
    locationName: names.locationName,
    parishName: names.parishName,
    address: fields.address,
    phone: fields.phone,
    email: fields.email,
    schedules,
  };
}
// ─── Task 8: Name normalizer + matcher ───────────────────────────────────────
// Words that carry no identifying information in a church name.
const NOISE_WORDS = new Set([
  'church', 'parish', 'chapel', 'centre', 'center', 'mass',
  'saint', 'st', 'our', 'lady', 'of', 'the', 'a', 'an',
]);

/**
 * Reduce a church name to the words that identify it, for comparison.
 *
 * Steps: lower-case; strip combining accents (NFD decomposition); replace
 * every character other than a-z, 0-9 and whitespace with a space; drop
 * one-character words and NOISE_WORDS; join the rest with single spaces.
 *
 * @param name Church name as written.
 * @returns The normalised name; empty when nothing identifying remains.
 */
export function normalizeName(name: string): string {
  const unaccented = name.toLowerCase().normalize('NFD').replace(/[\u0300-\u036f]/g, '');
  const kept: string[] = [];
  for (const word of unaccented.replace(/[^a-z0-9\s]/g, ' ').split(/\s+/)) {
    if (word.length < 2 || NOISE_WORDS.has(word)) continue;
    kept.push(word);
  }
  return kept.join(' ');
}
/**
 * Similarity of two space-separated word lists: the number of distinct words
 * they share divided by the number of distinct words in either.
 *
 * @returns A value from 0 to 1; 0 when either side has no words.
 */
function wordOverlap(a: string, b: string): number {
  const left = new Set(a.split(' ').filter(Boolean));
  const right = new Set(b.split(' ').filter(Boolean));
  if (left.size === 0 || right.size === 0) return 0;

  const shared = [...left].filter(word => right.has(word)).length;
  return shared / (left.size + right.size - shared);
}
/**
 * Pick the existing church that best corresponds to a parsed entry.
 *
 * Primary rule: compare normalised names by word overlap and take the highest
 * score (the earliest church wins ties); it is accepted when the score is at
 * least 0.4.
 *
 * Fallback rule: when the name score is too low and the entry has an address
 * of at least 5 characters, return the first church whose address contains
 * the entry's first 12 address characters (case-insensitive).
 * NOTE(review): for addresses of 5-11 characters the compared prefix is the
 * whole, short address, which can match loosely — confirm the 5-character
 * floor is intended.
 *
 * @param locationName Name of the parsed entry.
 * @param address Address of the parsed entry, if any.
 * @param existing Candidate churches.
 * @returns The matching church, or null when neither rule matches.
 */
export function findMatch(
  locationName: string,
  address: string | null,
  existing: ExistingChurch[]
): ExistingChurch | null {
  const target = normalizeName(locationName);

  let winner: ExistingChurch | null = null;
  let topScore = 0;
  for (let i = 0; i < existing.length; i++) {
    const candidate = existing[i];
    const score = wordOverlap(target, normalizeName(candidate.name));
    if (score > topScore) {
      topScore = score;
      winner = candidate;
    }
  }
  if (topScore >= 0.4) return winner;

  // Fallback: address prefix match (first 12 chars)
  if (!address || address.length < 5) return null;
  const needle = address.slice(0, 12).toLowerCase();
  return existing.find(church => church.address?.toLowerCase().includes(needle)) ?? null;
}
// ─── DB Operations ────────────────────────────────────────────────────────────
/**
 * Write one parsed entry to the database, or only report it in a dry run.
 *
 * - Dry run: logs what would happen and updates the counters; no DB access.
 * - Matched church: inside one transaction, fills in phone/email when the
 *   stored value is empty, deletes all of the church's schedules and inserts
 *   the parsed ones.
 *   NOTE(review): an entry with zero parsed schedules still clears the stored
 *   schedules — confirm that is the intended "replace" semantics.
 * - New church: inside one transaction, creates the church (source
 *   'diocese-hk', latitude/longitude 0 as placeholders for later geocoding)
 *   and inserts its schedules, so a failed schedule insert cannot leave a
 *   church row without schedules.
 *
 * @param entry Parsed directory entry.
 * @param matched Existing church the entry was matched to, or null.
 * @param dryRun When true, nothing is written.
 * @param stats Counters, updated in place.
 * @throws Whatever the Prisma client throws; nothing is caught here.
 */
async function upsertChurch(
  entry: ParsedEntry,
  matched: ExistingChurch | null,
  dryRun: boolean,
  stats: ImportStats
): Promise<void> {
  // Keep a visible separator between the stored name and the imported name.
  const tag = matched
    ? `[MATCH] ${matched.name} -> ${entry.locationName}`
    : `[NEW] ${entry.locationName}`;
  const schedCount = entry.schedules.length;

  if (dryRun) {
    console.log(tag);
    if (!matched && entry.address) console.log(` Address: ${entry.address}`);
    console.log(` ${schedCount} schedules`);
    if (matched) stats.matched++; else stats.created++;
    stats.schedulesWritten += schedCount;
    return;
  }

  // Row payload for massSchedule.createMany, shared by both branches.
  const scheduleRows = (churchId: string) =>
    entry.schedules.map(s => ({
      churchId,
      dayOfWeek: s.dayOfWeek,
      time: s.time,
      language: s.language,
      notes: s.notes ?? null,
    }));

  if (matched) {
    const churchId = matched.id;
    // Only fill contact fields that are currently empty; never overwrite stored values.
    const update: Record<string, string> = {};
    if (!matched.phone && entry.phone) update.phone = entry.phone;
    if (!matched.email && entry.email) update.email = entry.email;

    await prisma.$transaction(async tx => {
      if (Object.keys(update).length > 0) {
        await tx.church.update({ where: { id: churchId }, data: update });
      }
      await tx.massSchedule.deleteMany({ where: { churchId } });
      if (schedCount > 0) {
        await tx.massSchedule.createMany({ data: scheduleRows(churchId) });
      }
    });
    stats.matched++;
  } else {
    await prisma.$transaction(async tx => {
      const newChurch = await tx.church.create({
        data: {
          name: entry.locationName,
          country: 'HK',
          source: 'diocese-hk',
          address: entry.address ?? undefined,
          phone: entry.phone ?? undefined,
          email: entry.email ?? undefined,
          latitude: 0,
          longitude: 0,
          hasWebsite: false,
        },
      });
      if (schedCount > 0) {
        await tx.massSchedule.createMany({ data: scheduleRows(newChurch.id) });
      }
    });
    stats.created++;
  }

  stats.schedulesWritten += schedCount;
  console.log(tag);
}
// ─── Main ─────────────────────────────────────────────────────────────────────
/**
 * CLI entry point: read the paste file, parse every entry, match it against
 * the existing HK churches and write (or, with --dry-run, only report) the
 * result, then print a summary.
 *
 * Arguments:
 *  - `--dry-run`      report only, no writes
 *  - `--file <path>`  input file (default: scripts/hk-parishes.txt under the
 *                     current working directory)
 *
 * Entries that fail to parse or have no usable location name are counted as
 * skipped and reported. A database error aborts the run; the database
 * handles are released in either case.
 *
 * @throws Error when `--file` is given without a path, plus any file-system
 *   or database error.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');

  let filePath = path.resolve(process.cwd(), 'scripts/hk-parishes.txt');
  const fileArgIdx = args.indexOf('--file');
  if (fileArgIdx >= 0) {
    const value = args[fileArgIdx + 1];
    // Reject "--file" at the end of the argument list or followed by another flag.
    if (value === undefined || value.startsWith('--')) {
      throw new Error('--file requires a path argument');
    }
    filePath = value;
  }

  console.log(`\n${'='.repeat(60)}`);
  console.log(`HK Diocese Parish Import`);
  console.log(`File: ${filePath}`);
  console.log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  console.log(`${'='.repeat(60)}\n`);

  try {
    const raw = fs.readFileSync(filePath, 'utf-8');
    const entryStrings = splitEntries(raw);
    console.log(`Found ${entryStrings.length} entries in file\n`);

    const existing = await prisma.church.findMany({
      where: { country: 'HK' },
      select: { id: true, name: true, address: true, phone: true, email: true },
    });
    console.log(`Loaded ${existing.length} existing HK churches\n`);

    const stats: ImportStats = { matched: 0, created: 0, schedulesWritten: 0, skipped: 0 };
    for (const entryStr of entryStrings) {
      let entry: ParsedEntry;
      try {
        entry = parseEntry(entryStr);
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(`[SKIP] Failed to parse entry: ${reason}`);
        stats.skipped++;
        continue;
      }
      if (!entry.locationName || entry.locationName === 'Unknown') {
        console.warn('[SKIP] Entry has no usable location name');
        stats.skipped++;
        continue;
      }
      const matched = findMatch(entry.locationName, entry.address, existing);
      await upsertChurch(entry, matched, dryRun, stats);
    }

    console.log(`\n${'='.repeat(60)}`);
    console.log(`Import Summary`);
    console.log(`${'='.repeat(60)}`);
    console.log(`Matched existing: ${stats.matched}`);
    console.log(`New churches: ${stats.created}`);
    console.log(`Skipped: ${stats.skipped}`);
    console.log(`Schedules written: ${stats.schedulesWritten}`);
    console.log(`${'='.repeat(60)}\n`);
  } finally {
    // Release the client and the pool on success and on failure alike.
    await prisma.$disconnect();
    await pool.end();
  }
}
// Only run when executed directly (not imported by tests)
import { fileURLToPath } from 'url';
// Start the import only when this module is the script Node was launched with;
// a plain import (for example from a test) must not trigger it.
if (fileURLToPath(import.meta.url) === process.argv[1]) {
  const reportAndExit = (err: unknown): void => {
    console.error('Fatal error:', err);
    process.exit(1);
  };
  main().catch(reportAndExit);
}