#!/usr/bin/env tsx
/**
 * Import HK Diocese parish directory from plain-text paste.
 * Usage: npx tsx scripts/import-hk-parishes.ts [--dry-run] [--file scripts/hk-parishes.txt]
 */
import dotenv from 'dotenv';
import path from 'path';
import fs from 'fs';

// `.env.local` is loaded before `.env`. By default dotenv does not overwrite
// variables that are already set, so values from `.env.local` take precedence.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

// `||` (not `??`) is deliberate here: an empty DATABASE_URL also falls back to the local default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the password portion of the URL masked.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);

const pool = new Pool({
  connectionString: dbUrl,
  // NOTE(review): certificate verification is switched off for any URL containing "neon".
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

// ─── Types ────────────────────────────────────────────────────────────────────

/** One mass time on one day of the week. */
export interface ParsedSchedule {
  dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
  time: string; // "HH:MM"
  language: string; // "English" | "Cantonese" | "Tagalog"
  notes: string | null;
}

/** Everything extracted from one directory entry of the pasted text. */
export interface ParsedEntry {
  locationName: string;
  parishName: string | null;
  address: string | null;
  phone: string | null;
  email: string | null;
  schedules: ParsedSchedule[];
}

interface ExistingChurch {
  id: string;
  name: string;
  address: string | null;
  phone: string | null;
  email: string | null;
}

interface ImportStats {
  matched: number;
  created: number;
  schedulesWritten: number;
  skipped: number;
}

// ─── Parser ───────────────────────────────────────────────────────────────────

/** Lower-cased lines that are page artefacts rather than content (includes the empty line). */
const ARTIFACT_LINES = new Set(['share', 'path', 'close', '']);
const LANG_PATTERN = /(Cantonese|English|Tagalog|Chinese)/i;

/** Marker that separates an entry's name lines from its body. */
const ENTRY_MARKER = '\nPath\nClose\n';

/** Language tag inside a schedule line, with or without surrounding parentheses. */
const LANG_TAG_RE = /\(?(Cantonese|English|Tagalog|Chinese)\)?/i;

// ─── Task 2: Entry splitter and name extractor ────────────────────────────────

/**
 * Split raw file text into individual entry strings.
 *
 * Line endings are normalised to "\n" and the text is cut at every
 * "\nPath\nClose\n" marker. Each returned entry is
 * `<segment before the marker> + marker + <segment after the marker>`, so
 * neighbouring entries overlap: the text before the marker ends with this
 * entry's name lines, and the text after it holds this entry's body followed
 * by whatever precedes the next marker. Any header text before the first
 * marker is not returned as an entry of its own; it stays at the start of the
 * first entry.
 *
 * @param raw Full pasted text.
 * @returns One string per marker found; empty when there is no marker.
 */
export function splitEntries(raw: string): string[] {
  const text = raw.replace(/\r\n?/g, '\n');
  const parts = text.split(ENTRY_MARKER);
  const entries: string[] = [];
  for (let i = 1; i < parts.length; i++) {
    entries.push(parts[i - 1] + ENTRY_MARKER + parts[i]);
  }
  return entries;
}

/**
 * Extract location name and parish name from the pre-marker text of an entry.
 *
 * Artefact lines ("Share", "Path", "Close", blanks) and lines that start with
 * a space are ignored. Of the remaining lines the last one is the location
 * name and the one before it, if any, is the parish name.
 *
 * @param preMarker Text that precedes the "Path/Close" marker.
 * @returns `locationName` is 'Unknown' when no usable line exists.
 */
export function extractNames(preMarker: string): { locationName: string; parishName: string | null } {
  const candidates = preMarker
    .split('\n')
    .map(l => l.trimEnd())
    .filter(l => {
      const content = l.trim();
      return content.length > 0 && !ARTIFACT_LINES.has(content.toLowerCase()) && !l.startsWith(' ');
    });

  const location = candidates[candidates.length - 1];
  const parish = candidates[candidates.length - 2];

  if (location === undefined) return { locationName: 'Unknown', parishName: null };
  return {
    locationName: location.trim(),
    parishName: parish === undefined ? null : parish.trim(),
  };
}

// ─── Task 3: Field extractor ──────────────────────────────────────────────────

/**
 * Alternation that ends a field value: a blank line, the start of another
 * known label on its own line, or the end of the text.
 */
const FIELD_END =
  '(?:\\n\\n|\\nFax|\\nEmail|\\nWebsite|\\nChurch|\\nParish|\\nAssistant|\\nDeacon|\\nSister|\\nChairperson|\\nResident|\\nRector|\\nP\\.C|\\nPastoral|\\nMass Time|$)';

/**
 * Extract address, phone, email from the entry body (text after Path/Close).
 *
 * A field is the text between a label line ("Address", "Phone", "Email",
 * matched case-insensitively) and the next terminator; inner newlines are
 * collapsed to spaces. Full-width parentheses (U+FF08 / U+FF09) in the address
 * and phone are normalised to ASCII "(" and ")".
 *
 * @param body Entry text after the "Path/Close" marker.
 * @returns Each field, or `null` when the label is missing or its value is empty.
 */
export function extractFields(body: string): { address: string | null; phone: string | null; email: string | null } {
  // Written with \u escapes so the pattern survives copy/paste and re-encoding.
  const normalise = (s: string) => s.replace(/\uFF08/g, '(').replace(/\uFF09/g, ')').trim();

  function extractField(fieldName: string): string | null {
    const regex = new RegExp(`\\b${fieldName}\\n([\\s\\S]*?)${FIELD_END}`, 'i');
    const m = body.match(regex);
    if (!m) return null;
    const value = m[1].replace(/\n/g, ' ').trim();
    return value || null;
  }

  const address = extractField('Address');
  const rawPhone = extractField('Phone');
  const email = extractField('Email');

  return {
    address: address ? normalise(address) : null,
    phone: rawPhone ? normalise(rawPhone) : null,
    email,
  };
}

// ─── Task 4: Time normalizer ──────────────────────────────────────────────────

/** Format an hour/minute pair as zero-padded "HH:MM". */
function formatHHMM(hour: number, minute: number): string {
  return `${String(hour).padStart(2, '0')}:${String(minute).padStart(2, '0')}`;
}

/**
 * Normalise a time string to "HH:MM" 24-hour format.
 * Accepts: "8:00am", "11:30 am", "7:00 a.m.", "12:00 noon", "noon", etc.
 *
 * @param raw Text containing a time; surrounding text is ignored.
 * @returns The normalised time, or `null` if no valid time is found. This
 *   includes out-of-range values such as "13:30pm" or "9:75am".
 */
export function normalizeTime(raw: string): string | null {
  const s = raw.trim().toLowerCase();

  if (s.includes('noon')) {
    if (s === 'noon') return '12:00';
    const noon = s.match(/(\d{1,2}):(\d{2})\s*noon/);
    if (noon) {
      const hour = parseInt(noon[1], 10);
      const minute = parseInt(noon[2], 10);
      return hour <= 23 && minute <= 59 ? formatHHMM(hour, minute) : null;
    }
    // Otherwise fall through: the text may still hold an am/pm time.
  }

  const m = s.match(/(\d{1,2}):(\d{2})\s*(am|pm|a\.m\.|p\.m\.)/);
  if (!m) return null;

  let hour = parseInt(m[1], 10);
  const minute = parseInt(m[2], 10);
  // A 12-hour clock reading cannot exceed 12:59.
  if (hour > 12 || minute > 59) return null;

  const period = m[3].replace(/\./g, '');
  if (period === 'am') {
    if (hour === 12) hour = 0; // 12:xx am is just after midnight
  } else if (hour !== 12) {
    hour += 12;
  }
  return formatHHMM(hour, minute);
}

// ─── Task 5: Schedule line parser ────────────────────────────────────────────

const CONDITIONAL_PATTERN = /^([\w\s]+(?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|month)[^:]*:)\s*/i;

/**
 * Pull the language tag out of a schedule line.
 *
 * "Chinese" is treated as Cantonese; the language defaults to English when no
 * tag is present. When a tag is found it is removed from the line, together
 * with a dangling "(" left at the end of the line.
 */
function extractLanguageTag(line: string): { language: string; remainder: string } {
  const m = line.match(LANG_TAG_RE);
  if (!m) return { language: 'English', remainder: line };

  const tag = m[1].toLowerCase();
  let language = 'English';
  if (tag === 'cantonese' || tag === 'chinese') language = 'Cantonese';
  else if (tag === 'tagalog') language = 'Tagalog';

  const remainder = line.replace(m[0], '').replace(/\s*\(\s*$/, '').trim();
  return { language, remainder };
}

/**
 * Parse a single schedule text line into 0-N ParsedSchedule records.
 *
 * The line is split on commas; every token that contains a time yields one
 * record. Text left in a token after removing the time becomes that record's
 * note; otherwise the line's conditional prefix (e.g. "First Sunday of the
 * month:") is used as the note.
 *
 * @param line One line from a "Sunday Masses" / "Anticipated Sunday Masses" section.
 * @param dayOfWeek The fixed day for this section (0=Sun, 6=Sat for Anticipated).
 */
export function parseScheduleLine(line: string, dayOfWeek: number): ParsedSchedule[] {
  const tagged = extractLanguageTag(line.trim());
  const language = tagged.language;

  // Strip "Saturday" / "on Saturday" anchors (Anticipated Sunday section)
  let remainder = tagged.remainder.replace(/\bSaturday\b/gi, '').replace(/\bon\b/gi, '').trim();

  // Extract conditional note prefix
  let sectionNotes: string | null = null;
  const condMatch = remainder.match(CONDITIONAL_PATTERN);
  if (condMatch) {
    sectionNotes = condMatch[1].replace(/:$/, '').trim();
    remainder = remainder.slice(condMatch[0].length);
  }

  const results: ParsedSchedule[] = [];
  for (const token of remainder.split(',').map(t => t.trim()).filter(Boolean)) {
    const time = normalizeTime(token);
    if (!time) continue;

    // Anything in the token that isn't the time itself is a note. A bare
    // "noon" token is a time too, so it must not survive as a note.
    const noteText =
      token
        .replace(/\d{1,2}:\d{2}\s*(?:am|pm|a\.m\.|p\.m\.|noon)|^noon$/i, '')
        .replace(/\s+/g, ' ')
        .trim() || null;

    results.push({ dayOfWeek, time, language, notes: noteText ?? sectionNotes });
  }
  return results;
}

// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────

// The day regexes below accept any trailing word characters after an
// abbreviation, so the common longer spellings need entries here as well.
const DAY_ABBREV: Record<string, number> = {
  mon: 1,
  tue: 2,
  tues: 2,
  wed: 3,
  weds: 3,
  thur: 4,
  thurs: 4,
  thu: 4,
  fri: 5,
  sat: 6,
  sun: 0,
};

const DAY_FULL: Record<string, number> = {
  monday: 1,
  tuesday: 2,
  wednesday: 3,
  thursday: 4,
  friday: 5,
  saturday: 6,
  sunday: 0,
};

/** Look up a lower-cased day name or abbreviation; `undefined` when unknown. */
function dayIndex(token: string): number | undefined {
  return DAY_FULL[token] ?? DAY_ABBREV[token];
}

/**
 * Turn a day prefix into day-of-week numbers (0=Sun … 6=Sat).
 *
 * Supports a range ("Monday to Friday", "Sat to Mon" wraps past Sunday) and
 * lists separated by commas, "&" or spaces ("Mon, Tue & Thur"). Dots and
 * colons are ignored. Unknown words are skipped.
 */
function parseDays(prefix: string): number[] {
  const s = prefix.toLowerCase().replace(/\./g, '').replace(/:/g, '').trim();

  // Range: "monday to friday" or "mon to sat"
  const rangeMatch = s.match(/(\w+)\s+to\s+(\w+)/);
  if (rangeMatch) {
    const fromDay = dayIndex(rangeMatch[1]);
    const toDay = dayIndex(rangeMatch[2]);
    if (fromDay !== undefined && toDay !== undefined) {
      const days: number[] = [];
      for (let d = fromDay; d !== toDay; d = (d + 1) % 7) days.push(d);
      days.push(toDay);
      return days;
    }
  }

  // List: "mon, tue, thur" or "tue & sat"
  const days = s
    .split(/[,&\s]+/)
    .map(t => t.trim())
    .filter(Boolean)
    .map(dayIndex)
    .filter((d): d is number => d !== undefined);
  return [...new Set(days)];
}

// Matches a day-prefix at the start of a token (requires trailing space/colon)
const DAY_PREFIX_RE = /^((?:(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*(?:[,&]\s*(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*)*(?:to\s+\w+\.?\s*)?)|(?:(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+to\s+\w+)?))[\s:]+/i;

// Matches a token that is ONLY a day (or day list) with no time — e.g. "Mon." "Tue."
const PURE_DAY_RE = /^((?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?|(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))\.?$/i;

/**
 * Parse a weekday mass line that may have day prefixes.
 *
 * Algorithm: split by comma and walk the tokens while tracking the "current"
 * set of days (initially Mon–Fri). Tokens that name only days are collected
 * until a token with a time appears; that time, and later bare times, apply
 * to the collected days. One record is produced per (day, time) pair; notes
 * are always `null`.
 */
export function parseWeekdayLine(line: string): ParsedSchedule[] {
  const { language, remainder } = extractLanguageTag(line.trim());
  const tokens = remainder.split(',').map(t => t.trim()).filter(Boolean);

  const results: ParsedSchedule[] = [];
  let currentDays: number[] = [1, 2, 3, 4, 5]; // default Mon–Fri
  let pendingDays: number[] = []; // day-only tokens wait here until a time appears

  const emit = (time: string): void => {
    for (const day of currentDays) results.push({ dayOfWeek: day, time, language, notes: null });
  };

  for (const token of tokens) {
    const prefixMatch = token.match(DAY_PREFIX_RE);

    if (prefixMatch) {
      const days = parseDays(prefixMatch[1]);
      const time = normalizeTime(token.slice(prefixMatch[0].length));
      if (!time) {
        // Day prefix without a usable time: remember the days for later.
        pendingDays.push(...days);
        continue;
      }
      // Pending days merge with this token's own days; with neither, the
      // previous day set stays in force.
      if (pendingDays.length > 0) currentDays = [...new Set([...pendingDays, ...days])];
      else if (days.length > 0) currentDays = days;
      pendingDays = [];
      emit(time);
      continue;
    }

    if (PURE_DAY_RE.test(token)) {
      // Pure day token like "Mon." "Tue." "Tuesday"
      pendingDays.push(...parseDays(token));
      continue;
    }

    const time = normalizeTime(token);
    if (!time) continue;
    if (pendingDays.length > 0) {
      currentDays = [...new Set(pendingDays)];
      pendingDays = [];
    }
    emit(time);
  }

  return results;
}

// ─── Task 7: Full entry parser ────────────────────────────────────────────────

/** Lower-cased section headings whose lines are ignored. */
const SKIP_SECTIONS = new Set(['special masses', 'eucharist adoration']);

/**
 * Parse a full raw entry string (including pre-marker names) into a ParsedEntry.
 *
 * Names come from the text before the first "Path/Close" marker; contact
 * fields and schedules come from the text after it. Schedules are read from
 * the "Mass Time" block, which runs up to the next "Share" line or the end of
 * the text. Lines that appear before any recognised section heading, or under
 * a skipped heading, are ignored.
 *
 * @param raw One entry as produced by `splitEntries`. Without a marker the
 *   whole string is treated as the body and the location is 'Unknown'.
 */
export function parseEntry(raw: string): ParsedEntry {
  const markerIdx = raw.indexOf(ENTRY_MARKER);
  const hasMarker = markerIdx >= 0;
  const pre = hasMarker ? raw.slice(0, markerIdx) : '';
  const body = hasMarker ? raw.slice(markerIdx + ENTRY_MARKER.length) : raw;

  const { locationName, parishName } = extractNames(pre);
  const { address, phone, email } = extractFields(body);

  const schedules: ParsedSchedule[] = [];
  const massSectionMatch = body.match(/Mass Time\n([\s\S]*?)(?:Share\n|$)/i);

  if (massSectionMatch) {
    let currentSection: 'sunday' | 'anticipated' | 'weekday' | 'skip' | null = null;

    for (const line of massSectionMatch[1].split('\n')) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      const lower = trimmed.toLowerCase();

      // Section headings switch mode and carry no schedule data themselves.
      if (lower === 'sunday masses') { currentSection = 'sunday'; continue; }
      if (lower === 'anticipated sunday masses') { currentSection = 'anticipated'; continue; }
      if (lower === 'weekday masses') { currentSection = 'weekday'; continue; }
      if (SKIP_SECTIONS.has(lower)) { currentSection = 'skip'; continue; }

      switch (currentSection) {
        case 'sunday':
          schedules.push(...parseScheduleLine(trimmed, 0));
          break;
        case 'anticipated':
          schedules.push(...parseScheduleLine(trimmed, 6));
          break;
        case 'weekday':
          schedules.push(...parseWeekdayLine(trimmed));
          break;
        default:
          // 'skip' or no section yet: ignore the line.
          break;
      }
    }
  }

  return { locationName, parishName, address, phone, email, schedules };
}