Files
ScraperControl/scripts/import-hk-parishes.ts
albertfj114 eedb442e78 feat: add full entry parser for HK parishes
parseEntry composes extractNames, extractFields, parseScheduleLine,
and parseWeekdayLine into a single ParsedEntry. Routes schedule
lines by section header (Sunday/Anticipated/Weekday) and skips
Special Masses and Eucharist Adoration sections.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-03 16:18:05 -04:00

372 lines
14 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env tsx
/**
* Import HK Diocese parish directory from plain-text paste.
* Usage: npx tsx scripts/import-hk-parishes.ts [--dry-run] [--file scripts/hk-parishes.txt]
*/
import dotenv from 'dotenv';
import path from 'path';
import fs from 'fs';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL from the environment, or a local development
// Postgres instance when it is unset or empty.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the first ":<text>@" run replaced by ":***@" — intended to
// keep the password portion of the URL out of the console output.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
// URLs containing the substring "neon" get an ssl option with certificate
// verification disabled; for every other URL `ssl` is left undefined.
const pool = new Pool({
connectionString: dbUrl,
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma client wired to the pg pool above through the PrismaPg adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// ─── Types ────────────────────────────────────────────────────────────────────
/** One mass time (a single day/time/language combination) produced by the schedule parsers. */
export interface ParsedSchedule {
dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
time: string; // 24-hour "HH:MM", as returned by normalizeTime
language: string; // "English" | "Cantonese" | "Tagalog"
notes: string | null; // free-text qualifier taken from the schedule line, or null when there is none
}
/** Result of parsing one directory entry: names, contact fields and all mass times found. */
export interface ParsedEntry {
locationName: string; // 'Unknown' when no name line could be found before the Path/Close marker
parishName: string | null; // null when only one name line precedes the marker
address: string | null;
phone: string | null;
email: string | null;
schedules: ParsedSchedule[];
}
/**
 * Minimal shape of a church record.
 * NOTE(review): not used in the visible part of this file — presumably a row
 * already stored in the database that parsed entries are matched against; confirm.
 */
interface ExistingChurch {
id: string;
name: string;
address: string | null;
phone: string | null;
email: string | null;
}
/**
 * Counters describing an import run.
 * NOTE(review): the code that updates these is outside the visible part of the
 * file — the per-field meanings are inferred from the names only; confirm.
 */
interface ImportStats {
matched: number;
created: number;
schedulesWritten: number;
skipped: number;
}
// ─── Parser ───────────────────────────────────────────────────────────────────
// Trimmed, lower-cased line contents that extractNames discards as page artifacts
// rather than names; the empty string makes blank lines count as artifacts too.
const ARTIFACT_LINES = new Set(['share', 'path', 'close', '']);
// Case-insensitive match for one of the recognised language keywords.
// NOTE(review): not referenced in the visible part of this file (the schedule
// parsers inline their own variant with optional parentheses) — confirm it is still needed.
const LANG_PATTERN = /(Cantonese|English|Tagalog|Chinese)/i;
// ─── Task 2: Entry splitter and name extractor ────────────────────────────────
/**
 * Break the pasted directory text into one string per entry.
 *
 * Line endings are first normalised to "\n". The text is then cut at every
 * "\nPath\nClose\n" marker, and each entry is rebuilt as
 * `<segment before the marker> + marker + <segment after the marker>`.
 * Consecutive entries therefore overlap: the segment after one marker is also
 * the pre-marker text of the next entry, and the first entry's pre-marker text
 * still contains whatever precedes the first marker.
 *
 * @param raw Full file contents.
 * @returns One string per marker occurrence; empty when the marker never appears.
 */
export function splitEntries(raw: string): string[] {
  const marker = '\nPath\nClose\n';
  const segments = raw.replace(/\r\n?/g, '\n').split(marker);
  // Pair every segment with the one preceding it (idx indexes the un-sliced array).
  return segments.slice(1).map((after, idx) => segments[idx] + marker + after);
}
/**
 * Pull the location name and (optional) parish name out of the text that
 * precedes an entry's Path/Close marker.
 *
 * Lines are considered after right-trimming. A line is ignored when it starts
 * with a space, is blank, or (trimmed and lower-cased) is listed in
 * ARTIFACT_LINES. Of the remaining lines only the last two matter: the final
 * one is the location name and the one before it is the parish name.
 *
 * @param preMarker Text before the "\nPath\nClose\n" marker.
 * @returns `locationName` is 'Unknown' when no usable line exists; `parishName`
 *          is null unless two usable lines were found.
 */
export function extractNames(preMarker: string): { locationName: string; parishName: string | null } {
  // Walk backwards so we can stop as soon as the last two usable lines are found.
  const picked: string[] = [];
  for (const rawLine of preMarker.split('\n').reverse()) {
    if (picked.length === 2) break;
    const row = rawLine.trimEnd();
    if (row.startsWith(' ')) continue;
    const core = row.trim();
    if (core.length === 0 || ARTIFACT_LINES.has(core.toLowerCase())) continue;
    picked.unshift(core);
  }

  const [first, second] = picked;
  if (first === undefined) {
    return { locationName: 'Unknown', parishName: null };
  }
  if (second === undefined) {
    return { locationName: first, parishName: null };
  }
  return { locationName: second, parishName: first };
}
// ─── Task 3: Field extractor ──────────────────────────────────────────────────
/**
 * Extract address, phone and email from the entry body (text after Path/Close).
 *
 * Each field is expected as a label line ("Address", "Phone", "Email",
 * case-insensitive) followed by its value on the next line(s). The value runs
 * until the first blank line, one of the known follow-up labels (Fax, Email,
 * Website, Church, Parish, ..., Mass Time) or the end of the text; internal
 * line breaks are replaced by spaces.
 *
 * Full-width parentheses (U+FF08 / U+FF09) in the address and phone are
 * normalised to ASCII "(" and ")". The email is returned as found.
 *
 * @param body Entry text following the Path/Close marker.
 * @returns The three fields, each null when the label is absent or its value is empty.
 */
export function extractFields(body: string): { address: string | null; phone: string | null; email: string | null } {
  // The full-width parentheses are written as escapes so the patterns survive
  // editors and viewers that strip or flag "ambiguous" Unicode characters.
  const normalise = (s: string): string => s.replace(/\uFF08/g, '(').replace(/\uFF09/g, ')').trim();

  // Anything that ends a field value: blank line, a following label, or end of text.
  const terminator =
    '(?:\\n\\n|\\nFax|\\nEmail|\\nWebsite|\\nChurch|\\nParish|\\nAssistant|\\nDeacon|\\nSister|\\nChairperson|\\nResident|\\nRector|\\nP\\.C|\\nPastoral|\\nMass Time|$)';

  function extractField(fieldName: string): string | null {
    const regex = new RegExp(`\\b${fieldName}\\n([\\s\\S]*?)${terminator}`, 'i');
    const captured = body.match(regex)?.[1];
    if (captured === undefined) return null;
    const value = captured.replace(/\n/g, ' ').trim();
    return value || null;
  }

  const address = extractField('Address');
  const rawPhone = extractField('Phone');
  const email = extractField('Email');
  return {
    address: address ? normalise(address) : null,
    phone: rawPhone ? normalise(rawPhone) : null,
    email: email || null,
  };
}
// ─── Task 4: Time normalizer ──────────────────────────────────────────────────
/**
 * Normalise a time string to "HH:MM" 24-hour format.
 *
 * Accepts: "8:00am", "11:30 am", "7:00 a.m.", "12:00 noon", bare "noon", etc.
 * The first time found anywhere in the string is used.
 *
 * Out-of-range values never produce a malformed result: minutes above 59 or an
 * hour above 23 yield null. An hour of 13-23 combined with "pm" is taken to be
 * 24-hour already (so "13:30pm" is "13:30"); combined with "am" it is
 * contradictory and yields null.
 *
 * @param raw Text containing a time.
 * @returns "HH:MM", or null if no valid time is found.
 */
export function normalizeTime(raw: string): string | null {
  const s = raw.trim().toLowerCase();

  // Single exit point that refuses to format anything outside 00:00-23:59
  // (NaN fails every comparison and so also ends up as null).
  const format = (hour: number, minute: number): string | null =>
    hour >= 0 && hour <= 23 && minute >= 0 && minute <= 59
      ? `${String(hour).padStart(2, '0')}:${String(minute).padStart(2, '0')}`
      : null;

  if (s.includes('noon')) {
    if (s === 'noon') return '12:00';
    const noon = s.match(/(\d{1,2}):(\d{2})\s*noon/);
    // "H:MM noon" is taken literally; no 12-hour conversion applies.
    if (noon) return format(parseInt(noon[1] ?? '', 10), parseInt(noon[2] ?? '', 10));
    // Otherwise fall through: "noon" may just be part of another word ("afternoon").
  }

  const m = s.match(/(\d{1,2}):(\d{2})\s*(am|pm|a\.m\.|p\.m\.)/);
  if (!m) return null;
  let h = parseInt(m[1] ?? '', 10);
  const min = parseInt(m[2] ?? '', 10);
  const period = (m[3] ?? '').replace(/\./g, '');

  if (h <= 12) {
    if (period === 'am') {
      if (h === 12) h = 0; // 12:xx am is just after midnight
    } else if (h !== 12) {
      h += 12; // 12:xx pm stays at 12
    }
  } else if (period === 'am') {
    return null; // e.g. "13:30am" is self-contradictory
  }
  return format(h, min);
}
// ─── Task 5: Schedule line parser ────────────────────────────────────────────
// Leading conditional prefix such as "First Sunday of the month:" — some words,
// then a day name or "month", then anything up to a colon.
// The negative lookahead rejects a colon that is really the hour/minute
// separator of a time: without it "Every Sunday 9:00am" matched "Every Sunday 9:"
// as the prefix, leaving "00am" behind and silently dropping the mass.
const CONDITIONAL_PATTERN = /^([\w\s]+(?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|month)[^:]*:(?!\d{2}\s*(?:am|pm|a\.m\.|p\.m\.|noon)))\s*/i;
/**
 * Parse a single schedule text line into 0-N ParsedSchedule records.
 *
 * Steps, in order:
 *  1. The first language keyword (optionally parenthesised) is removed and
 *     sets the language; "Chinese" is mapped to "Cantonese". Default "English".
 *  2. The words "Saturday" and "on" are stripped (Anticipated Sunday lines).
 *  3. A conditional prefix ending in ":" (see CONDITIONAL_PATTERN) is removed
 *     and kept as a fallback note.
 *  4. The rest is split on commas; every token containing a recognisable time
 *     yields one record. Text left in the token after removing the time is the
 *     record's note, otherwise the conditional prefix (or null).
 *
 * @param line One line from a Sunday / Anticipated Sunday section.
 * @param dayOfWeek The fixed day for this section (0=Sun, 6=Sat for Anticipated).
 * @returns One record per time found; empty when the line holds no time.
 */
export function parseScheduleLine(line: string, dayOfWeek: number): ParsedSchedule[] {
  let remainder = line.trim();
  let language = 'English';
  let sectionNotes: string | null = null;

  // Extract language tag (with or without surrounding parens)
  const langMatch = remainder.match(/\(?(Cantonese|English|Tagalog|Chinese)\)?/i);
  if (langMatch) {
    const raw = langMatch[1].toLowerCase();
    language = raw === 'cantonese' || raw === 'chinese' ? 'Cantonese'
      : raw === 'tagalog' ? 'Tagalog'
      : 'English';
    remainder = remainder.replace(langMatch[0], '').trim();
  }

  // Strip "Saturday" / "on Saturday" anchors (Anticipated Sunday section)
  remainder = remainder.replace(/\bSaturday\b/gi, '').replace(/\bon\b/gi, '').trim();

  // Extract conditional note prefix (group 1 always ends with the colon)
  const condMatch = remainder.match(CONDITIONAL_PATTERN);
  if (condMatch) {
    sectionNotes = condMatch[1].replace(/:$/, '').trim();
    remainder = remainder.slice(condMatch[0].length);
  }

  // Split by comma into time tokens
  const tokens = remainder.split(',').map(t => t.trim()).filter(Boolean);
  const results: ParsedSchedule[] = [];
  for (const token of tokens) {
    const time = normalizeTime(token);
    if (!time) continue; // tokens without a time carry no schedule
    // Anything in the token that isn't the time or period is a note
    const noteText = token
      .replace(/\d{1,2}:\d{2}\s*(am|pm|a\.m\.|p\.m\.|noon)/i, '')
      .replace(/\s+/g, ' ')
      .trim() || null;
    results.push({
      dayOfWeek,
      time,
      language,
      notes: noteText ?? sectionNotes,
    });
  }
  return results;
}
// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────
// Abbreviated day names -> day index (0=Sun ... 6=Sat). Includes the longer
// spoken-style forms ("tues", "weds", "thurs"), which the day-prefix regexes
// accept but which previously resolved to nothing and were silently dropped.
const DAY_ABBREV: Record<string, number> = {
  mon: 1, tue: 2, tues: 2, wed: 3, weds: 3, thur: 4, thurs: 4, thu: 4, fri: 5, sat: 6, sun: 0,
};
// Full day names -> day index (0=Sun ... 6=Sat).
const DAY_FULL: Record<string, number> = {
  monday: 1, tuesday: 2, wednesday: 3, thursday: 4, friday: 5, saturday: 6, sunday: 0,
};
/**
 * Turn a day expression into a list of day indices (0=Sun ... 6=Sat).
 *
 * Dots and colons are ignored and matching is case-insensitive. Two forms are
 * understood:
 *  - a range "X to Y" (e.g. "Monday to Friday", "Sat to Mon"), expanded
 *    inclusively and wrapping past Saturday back to Sunday;
 *  - a list separated by commas, "&" or whitespace (e.g. "Mon, Tue & Thur"),
 *    returned de-duplicated in order of first appearance.
 * Unrecognised words are skipped.
 *
 * @param prefix Text holding only the day expression.
 * @returns Day indices; empty when no day name is recognised.
 */
function parseDays(prefix: string): number[] {
  const s = prefix.toLowerCase().replace(/\./g, '').replace(/:/g, '').trim();

  // typeof guard: a plain-object lookup of e.g. "constructor" would otherwise
  // return an inherited Object.prototype member instead of undefined.
  const dayIndex = (token: string | undefined): number | undefined => {
    if (token === undefined) return undefined;
    const hit: unknown = DAY_FULL[token] ?? DAY_ABBREV[token];
    return typeof hit === 'number' ? hit : undefined;
  };

  // Range: "monday to friday" or "mon to sat"
  const rangeMatch = s.match(/(\w+)\s+to\s+(\w+)/);
  if (rangeMatch) {
    const fromDay = dayIndex(rangeMatch[1]);
    const toDay = dayIndex(rangeMatch[2]);
    if (fromDay !== undefined && toDay !== undefined) {
      const days: number[] = [];
      // Modulo 7 lets a range such as "sat to mon" wrap around the week.
      for (let d = fromDay; d !== toDay; d = (d + 1) % 7) days.push(d);
      days.push(toDay);
      return days;
    }
  }

  // List: "mon, tue, thur" or "tue & sat"
  const days = s
    .split(/[,&\s]+/)
    .map(t => dayIndex(t.trim()))
    .filter((d): d is number => d !== undefined);
  return [...new Set(days)];
}
// Matches a day expression at the start of a token, followed by at least one
// whitespace or colon character. Capture group 1 is the day expression itself.
// Two alternatives:
//  - abbreviated form: Mon/Tue/Wed/Thur/Thu/Fri/Sat/Sun plus any trailing word
//    characters and an optional dot, optionally followed by more days separated
//    by "," or "&", and an optional "to <word>" range end;
//  - a full day name, optionally followed by " to <word>".
const DAY_PREFIX_RE = /^((?:(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*(?:[,&]\s*(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*)*(?:to\s+\w+\.?\s*)?)|(?:(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+to\s+\w+)?))[\s:]+/i;
// Matches a token that consists of a single day name and nothing else (no time),
// with an optional trailing dot — e.g. "Mon." "Tue." "Tuesday".
const PURE_DAY_RE = /^((?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?|(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))\.?$/i;
/**
 * Parse a weekday mass line that may have day prefixes.
 *
 * Algorithm: split by comma, process each token; track current days across tokens.
 *  - A token with a day prefix and a time switches the current days to that
 *    prefix (merged with any pending day-only tokens) and emits the time for
 *    each of those days.
 *  - A token that is only a day (no time) is remembered until a time appears.
 *  - A token that is only a time is emitted for the pending days if there are
 *    any, otherwise for the current days (initially Monday-Friday).
 * Tokens with neither a recognisable day nor a time are ignored.
 *
 * @param line One line from the "Weekday Masses" section.
 * @returns One record per (day, time) pair; `notes` is always null.
 */
export function parseWeekdayLine(line: string): ParsedSchedule[] {
let remainder = line.trim();
let language = 'English';
// Language tag: first keyword found, with or without surrounding parentheses.
const langMatch = remainder.match(/\(?(Cantonese|English|Tagalog|Chinese)\)?/i);
if (langMatch) {
const raw = langMatch[1].toLowerCase();
// "Chinese" is recorded as Cantonese.
language = raw === 'cantonese' || raw === 'chinese' ? 'Cantonese'
: raw === 'tagalog' ? 'Tagalog' : 'English';
// Remove the tag, plus an opening parenthesis left dangling at the end of the line.
remainder = remainder.replace(langMatch[0], '').replace(/\s*\(\s*$/, '').trim();
}
const results: ParsedSchedule[] = [];
const tokens = remainder.split(',').map(t => t.trim()).filter(Boolean);
let currentDays: number[] = [1, 2, 3, 4, 5]; // default Mon-Fri
let accumulatedDays: number[] = []; // day-only tokens accumulate here until a time appears
for (const token of tokens) {
const prefixMatch = token.match(DAY_PREFIX_RE);
if (prefixMatch) {
const days = parseDays(prefixMatch[1]);
// Whatever follows the day prefix (and its separating space/colon) should be the time.
const timePart = token.slice(prefixMatch[0].length);
const time = normalizeTime(timePart);
if (time) {
// Merge any previously accumulated day-only tokens with this token's days
// (if the prefix resolved to no day at all, keep using the current days)
const mergedDays = accumulatedDays.length > 0
? [...new Set([...accumulatedDays, ...days])]
: days.length > 0 ? days : currentDays;
accumulatedDays = [];
if (mergedDays.length > 0) currentDays = mergedDays;
for (const day of currentDays) results.push({ dayOfWeek: day, time, language, notes: null });
} else {
// Day-only token via prefix match: accumulate
if (days.length > 0) accumulatedDays.push(...days);
}
} else if (PURE_DAY_RE.test(token)) {
// Pure day token like "Mon." "Tue." "Tuesday" — accumulate
const days = parseDays(token);
if (days.length > 0) accumulatedDays.push(...days);
} else {
// No day information in this token: treat it as a bare time.
const time = normalizeTime(token);
if (time) {
// Apply any accumulated days, then reset
if (accumulatedDays.length > 0) {
currentDays = [...new Set(accumulatedDays)];
accumulatedDays = [];
}
for (const day of currentDays) results.push({ dayOfWeek: day, time, language, notes: null });
}
}
}
return results;
}
// ─── Task 7: Full entry parser ────────────────────────────────────────────────
// Section headings (lower-cased) whose lines are ignored entirely.
const SKIP_SECTIONS = new Set(['special masses', 'eucharist adoration']);
/**
 * Turn one raw entry string (name lines, Path/Close marker, body) into a ParsedEntry.
 *
 * The text before the first "\nPath\nClose\n" marker supplies the names and the
 * text after it supplies the contact fields and schedules. Without a marker the
 * whole string is treated as body and the names fall back to extractNames('').
 *
 * Schedules are read from the block that follows "Mass Time" up to the next
 * "Share" line (or the end of the body). Inside that block a heading line
 * selects how subsequent lines are parsed:
 *   "Sunday Masses"             -> parseScheduleLine(line, 0)
 *   "Anticipated Sunday Masses" -> parseScheduleLine(line, 6)
 *   "Weekday Masses"            -> parseWeekdayLine(line)
 *   any SKIP_SECTIONS heading   -> lines ignored
 * Lines that appear before the first heading are ignored as well.
 *
 * @param raw Entry text as produced by splitEntries.
 * @returns The parsed names, contact fields and schedules.
 */
export function parseEntry(raw: string): ParsedEntry {
  const marker = '\nPath\nClose\n';
  const cut = raw.indexOf(marker);
  let pre = '';
  let body = raw;
  if (cut >= 0) {
    pre = raw.slice(0, cut);
    body = raw.slice(cut + marker.length);
  }

  const names = extractNames(pre);
  const contact = extractFields(body);

  type LineParser = (text: string) => ParsedSchedule[];
  const ignore: LineParser = () => [];
  const headingParsers = new Map<string, LineParser>([
    ['sunday masses', text => parseScheduleLine(text, 0)],
    ['anticipated sunday masses', text => parseScheduleLine(text, 6)],
    ['weekday masses', text => parseWeekdayLine(text)],
  ]);

  const schedules: ParsedSchedule[] = [];
  const massSection = /Mass Time\n([\s\S]*?)(?:Share\n|$)/i.exec(body);
  if (massSection !== null) {
    // Until a known heading is seen, lines are dropped.
    let active: LineParser = ignore;
    for (const rawLine of massSection[1].split('\n')) {
      const text = rawLine.trim();
      if (text === '') continue;
      const key = text.toLowerCase();
      const next = headingParsers.get(key) ?? (SKIP_SECTIONS.has(key) ? ignore : undefined);
      if (next !== undefined) {
        // Heading line: switch parser, emit nothing for the heading itself.
        active = next;
        continue;
      }
      schedules.push(...active(text));
    }
  }

  return {
    locationName: names.locationName,
    parishName: names.parishName,
    address: contact.address,
    phone: contact.phone,
    email: contact.email,
    schedules,
  };
}