feat: add DB operations and CLI wiring for HK parish import

upsertChurch() handles matched churches (replace schedules atomically via $transaction, update contact fields if null) and new churches (create with source='diocese-hk', lat/lng=0 for later geocoding). main() wires up CLI args, file reading, matching loop, and summary. Guards main() call with ESM import.meta.url check to prevent execution on import during tests. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
fix: use true Jaccard similarity in wordOverlap (intersection/union)
2026-04-03 16:27:02 -04:00 · 2026-04-03 16:25:24 -04:00 · 2026-04-03 16:23:58 -04:00 · 2026-04-03 16:18:05 -04:00 · 2026-04-03 16:15:04 -04:00 · 2026-04-03 16:06:26 -04:00
2 changed files with 828 additions and 0 deletions
--- a/scripts/import-hk-parishes.test.ts
+++ b/scripts/import-hk-parishes.test.ts
@@ -0,0 +1,244 @@
+import { test } from 'node:test';
+import assert from 'node:assert/strict';
+import {
+  splitEntries,
+  extractNames,
+  extractFields,
+  normalizeTime,
+  parseScheduleLine,
+  parseWeekdayLine,
+  parseEntry,
+  normalizeName,
+  findMatch,
+} from './import-hk-parishes.js';
+
+// ─── Task 2: Entry splitter and name extractor ────────────────────────────────
+
+// Fixture: a header line followed by two entries, each containing the "Path\nClose" marker.
+// Exactly two entries must come back, each containing its own church name.
+test('splitEntries splits on Path/Close boundary', () => {
+  const raw = `HONG KONG CHURCHES\n\nParish A\nChurch A\nPath\nClose\nAddress\n1 Main St\n\nParish B\nChurch B\nPath\nClose\nAddress\n2 Side St\n`;
+  const entries = splitEntries(raw);
+  assert.equal(entries.length, 2);
+  assert.ok(entries[0].includes('Church A'));
+  assert.ok(entries[1].includes('Church B'));
+});
+
+// With two name lines, the last line is the location and the line before it is the parish.
+test('extractNames returns locationName and parishName', () => {
+  const pre = `Holy Cross Parish\nHOLY CROSS CHURCH`;
+  const result = extractNames(pre);
+  assert.equal(result.locationName, 'HOLY CROSS CHURCH');
+  assert.equal(result.parishName, 'Holy Cross Parish');
+});
+
+// A "Share" line and a line that starts with a space must not be picked up as names.
+test('extractNames strips Share and leading-space artifacts', () => {
+  const pre = `Share\n Carmelite Monastery\nSt. Anne's Parish\nCarmelite Monastery`;
+  const result = extractNames(pre);
+  assert.equal(result.locationName, 'Carmelite Monastery');
+  assert.equal(result.parishName, "St. Anne's Parish");
+});
+
+// A lone name line is reported as the location; there is no parish name in that case.
+test('extractNames handles single name line', () => {
+  const pre = `Cathedral Parish`;
+  const result = extractNames(pre);
+  assert.equal(result.locationName, 'Cathedral Parish');
+  assert.equal(result.parishName, null);
+});
+
+// ─── Task 3: Field extractor ──────────────────────────────────────────────────
+
+// Full fixture with Address/Phone/Fax/Email/Website headings; only address, phone and email are asserted.
+// The phone in the fixture uses full-width parentheses and is expected back with ASCII ones.
+test('extractFields parses address, phone, email', () => {
+  const body = `Address\n1 Holy Cross Path, Shau Kei Wan, Hong Kong\n\nPhone\n（852）2560-1823\n\nFax\n（852）2535-8246\n\nEmail\nholycrosshk@gmail.com\n\nWebsite\nClick Here\n\nMass Time\n`;
+  const f = extractFields(body);
+  assert.equal(f.address, '1 Holy Cross Path, Shau Kei Wan, Hong Kong');
+  assert.equal(f.phone, '(852)2560-1823');
+  assert.equal(f.email, 'holycrosshk@gmail.com');
+});
+
+// Headings that are absent from the body must come back as null rather than an empty string.
+test('extractFields handles missing fields gracefully', () => {
+  const body = `Address\nSalesian School, 16 Chai Wan Road, Hong Kong.\n\nMass Time\n`;
+  const f = extractFields(body);
+  assert.equal(f.address, 'Salesian School, 16 Chai Wan Road, Hong Kong.');
+  assert.equal(f.phone, null);
+  assert.equal(f.email, null);
+});
+
+// Several numbers on one Phone line are kept as a single string; only the parentheses are converted.
+test('extractFields strips full-width parens from phone', () => {
+  const body = `Phone\n（852）2819-5777, 2819-5845\n\n`;
+  const f = extractFields(body);
+  assert.equal(f.phone, '(852)2819-5777, 2819-5845');
+});
+
+// ─── Task 4: Time normalizer ──────────────────────────────────────────────────
+
+// "am"/"pm" suffix, with or without a space before it, converts to zero-padded 24-hour "HH:MM".
+test('normalizeTime handles am/pm with spaces', () => {
+  assert.equal(normalizeTime('8:00am'), '08:00');
+  assert.equal(normalizeTime('11:30 am'), '11:30');
+  assert.equal(normalizeTime('6:00pm'), '18:00');
+  assert.equal(normalizeTime('6:30 pm'), '18:30');
+});
+
+// Dotted "a.m." / "p.m." suffixes are treated the same as "am" / "pm".
+test('normalizeTime handles a.m./p.m. format', () => {
+  assert.equal(normalizeTime('7:00 a.m.'), '07:00');
+  assert.equal(normalizeTime('7:45 a.m.'), '07:45');
+  assert.equal(normalizeTime('6:00 p.m.'), '18:00');
+});
+
+// "12:00 noon" and 12-something pm both stay in the 12 o'clock hour.
+test('normalizeTime handles noon', () => {
+  assert.equal(normalizeTime('12:00 noon'), '12:00');
+  assert.equal(normalizeTime('12:30 pm'), '12:30');
+});
+
+// 12 am is the one am value whose hour changes: it becomes 00.
+test('normalizeTime handles 12:00am as midnight', () => {
+  assert.equal(normalizeTime('12:00am'), '00:00');
+});
+
+// Inputs without any h:mm + period pattern (including the empty string) yield null.
+test('normalizeTime returns null for unrecognised input', () => {
+  assert.equal(normalizeTime('Monday'), null);
+  assert.equal(normalizeTime(''), null);
+});
+
+// ─── Task 5: Schedule line parser ────────────────────────────────────────────
+
+// The second argument is the dayOfWeek that every record produced from the line receives.
+test('parseScheduleLine parses single time with language', () => {
+  const results = parseScheduleLine('9:30am (English)', 0);
+  assert.equal(results.length, 1);
+  assert.deepEqual(results[0], { dayOfWeek: 0, time: '09:30', language: 'English', notes: null });
+});
+
+// One record per comma-separated time; the language tag at the end applies to the line.
+test('parseScheduleLine parses multiple comma-separated times', () => {
+  const results = parseScheduleLine('8:00am,10:30 am (Cantonese)', 0);
+  assert.equal(results.length, 2);
+  assert.equal(results[0].time, '08:00');
+  assert.equal(results[1].time, '10:30');
+  assert.equal(results[1].language, 'Cantonese');
+});
+
+// The language tag is still recognised when the source text lacks the closing ")".
+test('parseScheduleLine handles missing closing paren', () => {
+  const results = parseScheduleLine('9:30 am (Cantonese', 0);
+  assert.equal(results[0].language, 'Cantonese');
+});
+
+test('parseScheduleLine defaults language to English when not specified', () => {
+  const results = parseScheduleLine('8:00am', 0);
+  assert.equal(results[0].language, 'English');
+});
+
+// Text left in a token after its time is removed is kept as that record's note.
+test('parseScheduleLine stores embedded note text', () => {
+  const results = parseScheduleLine('9:00 am Sunday School & Family Mass,11:30am (English)', 0);
+  assert.equal(results.length, 2);
+  assert.equal(results[0].time, '09:00');
+  assert.equal(results[0].notes, 'Sunday School & Family Mass');
+});
+
+// Anticipated-mass lines repeat the word "Saturday" before each time; it must not block parsing.
+test('parseScheduleLine handles Saturday anticipated format variations', () => {
+  const results = parseScheduleLine('Saturday 3:45 pm,Saturday 6:30 pm (Cantonese)', 6);
+  assert.equal(results.length, 2);
+  assert.equal(results[0].time, '15:45');
+  assert.equal(results[1].time, '18:30');
+});
+
+// Same information written with the day after the time ("... on Saturday").
+test('parseScheduleLine handles "on Saturday" suffix format', () => {
+  const results = parseScheduleLine('6:00pm on Saturday (Cantonese)', 6);
+  assert.equal(results.length, 1);
+  assert.equal(results[0].time, '18:00');
+  assert.equal(results[0].language, 'Cantonese');
+});
+
+// A "<condition>:" prefix before the time ends up in notes instead of being dropped.
+test('parseScheduleLine handles conditional prefix as notes', () => {
+  const results = parseScheduleLine('5th Sunday of the month: 7:15 am (Tagalog)', 0);
+  assert.equal(results.length, 1);
+  assert.equal(results[0].time, '07:15');
+  assert.equal(results[0].language, 'Tagalog');
+  assert.ok(results[0].notes?.includes('5th Sunday'));
+});
+
+// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────
+
+// Numeric comparator for day-of-week arrays. Array.prototype.sort() without a comparator
+// compares elements as strings, which only happens to work while every value is one digit.
+const byDayNumber = (a: number, b: number): number => a - b;
+
+// A line without any day prefix applies to Monday through Friday (dayOfWeek 1..5), in that order.
+test('parseWeekdayLine no prefix = all weekdays Mon-Fri', () => {
+  const results = parseWeekdayLine('7:15 am (Cantonese)');
+  assert.equal(results.length, 5);
+  assert.ok(results.every(r => r.time === '07:15'));
+  assert.ok(results.every(r => r.language === 'Cantonese'));
+  assert.deepEqual(results.map(r => r.dayOfWeek), [1, 2, 3, 4, 5]);
+});
+
+// Comma-separated abbreviations select exactly the listed days.
+test('parseWeekdayLine abbreviation list', () => {
+  const results = parseWeekdayLine('Mon., Tue., Thur. 8:00 a.m. (Cantonese)');
+  assert.equal(results.length, 3);
+  assert.deepEqual(results.map(r => r.dayOfWeek).sort(byDayNumber), [1, 2, 4]);
+});
+
+// "X to Y" with abbreviations expands to every day in the range, inclusive.
+test('parseWeekdayLine abbreviation range Mon. to Sat.', () => {
+  const results = parseWeekdayLine('Mon. to Sat. 9:15 am (English)');
+  assert.equal(results.length, 6);
+  assert.deepEqual(results.map(r => r.dayOfWeek).sort(byDayNumber), [1, 2, 3, 4, 5, 6]);
+});
+
+// Same range syntax with full day names and a colon before the time.
+test('parseWeekdayLine full-word range Monday to Friday', () => {
+  const results = parseWeekdayLine('Monday to Friday: 12:00 noon (English)');
+  assert.equal(results.length, 5);
+  assert.ok(results.every(r => r.time === '12:00'));
+});
+
+// "&" is accepted as a list separator alongside commas.
+test('parseWeekdayLine ampersand separator', () => {
+  const results = parseWeekdayLine('Tue., Thur. & Sat. 9:45 a.m. (Cantonese)');
+  assert.equal(results.length, 3);
+  assert.deepEqual(results.map(r => r.dayOfWeek).sort(byDayNumber), [2, 4, 6]);
+});
+
+// Three day-range/time groups on one line: 6 (Mon-Sat) + 5 (Mon-Fri) + 5 (Mon-Fri) = 16 records.
+test('parseWeekdayLine multiple time groups on one line', () => {
+  const results = parseWeekdayLine('Monday to Saturday: 7:45 am,Monday to Friday: 12:00 noon,Monday to Friday: 6:00 pm (English)');
+  assert.equal(results.length, 16);
+});
+
+// ─── Task 7: Full entry parser ────────────────────────────────────────────────
+
+// End-to-end fixture for one entry: two name lines, the Path/Close marker, contact headings,
+// and a "Mass Time" block with Sunday, Anticipated Sunday, Weekday and Special Masses sections.
+test('parseEntry extracts names, fields, and schedules from a full entry', () => {
+  const raw = `Holy Cross Parish\nHOLY CROSS CHURCH\nPath\nClose\nAddress\n1 Holy Cross Path, Shau Kei Wan, Hong Kong\n\nPhone\n（852）2560-1823\n\nEmail\nholycrosshk@gmail.com\n\nWebsite\nClick Here\n\nMass Time\nSunday Masses\n8:00am,9:30am (Cantonese)\n1:00 pm (English)\n\nAnticipated Sunday Masses\nSaturday 3:45 pm,Saturday 6:30 pm (Cantonese)\n\nWeekday Masses\n7:15 am (Cantonese)\n\nSpecial Masses\nSomething irrelevant\n`;
+  const entry = parseEntry(raw);
+  assert.equal(entry.locationName, 'HOLY CROSS CHURCH');
+  assert.equal(entry.parishName, 'Holy Cross Parish');
+  assert.equal(entry.address, '1 Holy Cross Path, Shau Kei Wan, Hong Kong');
+  assert.equal(entry.phone, '(852)2560-1823');
+  assert.equal(entry.email, 'holycrosshk@gmail.com');
+  // Sunday: 2 Cantonese + 1 English = 3 entries
+  const sunday = entry.schedules.filter(s => s.dayOfWeek === 0);
+  assert.equal(sunday.length, 3);
+  // Anticipated (Saturday): 2 entries
+  const saturday = entry.schedules.filter(s => s.dayOfWeek === 6);
+  assert.equal(saturday.length, 2);
+  // Weekday: 5 entries (Mon–Fri)
+  const weekday = entry.schedules.filter(s => s.dayOfWeek >= 1 && s.dayOfWeek <= 5);
+  assert.equal(weekday.length, 5);
+});
+
+// ─── Task 8: Name normalizer + matcher ───────────────────────────────────────
+
+// Noise words (church, parish, our, lady, of, st, mass, centre, ...) and punctuation are removed;
+// what is left is lower-cased and space-joined.
+test('normalizeName strips noise words and lowercases', () => {
+  assert.equal(normalizeName('HOLY CROSS CHURCH'), 'holy cross');
+  assert.equal(normalizeName('Our Lady Of Mount Carmel Church'), 'mount carmel');
+  assert.equal(normalizeName("St. Joseph's Parish"), 'joseph');
+  assert.equal(normalizeName('Salesian Mass Centre'), 'salesian');
+});
+
+test('findMatch matches by name overlap', () => {
+  const existing = [
+    { id: '1', name: 'Holy Cross (Sai Wan Ho)', address: '1 Holy Cross Path', phone: null, email: null },
+    { id: '2', name: 'St Joseph (Central)', address: '37 Garden Road', phone: null, email: null },
+  ];
+  assert.equal(findMatch('HOLY CROSS CHURCH', '1 Holy Cross Path', existing)?.id, '1');
+  assert.equal(findMatch("St. Joseph's Church", '37 Garden Road', existing)?.id, '2');
+});
+
+// A bilingual stored name still matches on its English words alone: it normalises to
+// "mount carmel wanchai", which shares 2 of 3 distinct words with "mount carmel".
+test('findMatch matches a bilingual name on its English words', () => {
+  const existing = [
+    { id: '3', name: '聖母聖衣堂 (Our Lady of Mount Carmel Wanchai)', address: 'No.1, Star Street', phone: null, email: null },
+  ];
+  assert.equal(findMatch('Our Lady Of Mount Carmel Church', 'No.1, Star Street, Wan Chai', existing)?.id, '3');
+});
+
+// The stored name is Chinese-only, so it normalises to an empty string and the name score is 0.
+// The only way this can match is the address-prefix fallback.
+test('findMatch falls back to address prefix match', () => {
+  const existing = [
+    { id: '3', name: '聖母聖衣堂', address: 'No.1, Star Street', phone: null, email: null },
+  ];
+  assert.equal(normalizeName('聖母聖衣堂'), '');
+  assert.equal(findMatch('Our Lady Of Mount Carmel Church', 'No.1, Star Street, Wan Chai', existing)?.id, '3');
+});
+
+// Neither the name nor the address prefix overlaps with the only stored church.
+test('findMatch returns null for no match', () => {
+  const existing = [
+    { id: '1', name: 'Holy Cross (Sai Wan Ho)', address: '1 Holy Cross Path', phone: null, email: null },
+  ];
+  assert.equal(findMatch('Salesian Mass Centre', 'Salesian School, 16 Chai Wan Road', existing), null);
+});
--- a/scripts/import-hk-parishes.ts
+++ b/scripts/import-hk-parishes.ts
@@ -0,0 +1,584 @@
+#!/usr/bin/env tsx
+/**
+ * Import HK Diocese parish directory from plain-text paste.
+ * Usage: npx tsx scripts/import-hk-parishes.ts [--dry-run] [--file scripts/hk-parishes.txt]
+ */
+
+import dotenv from 'dotenv';
+import path from 'path';
+import fs from 'fs';
+
+// Load environment files relative to the current working directory: .env.local first, then .env.
+// NOTE(review): dotenv's default is presumably to keep variables that are already set, which
+// would make .env.local win over .env — confirm against the dotenv documentation.
+dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
+dotenv.config({ path: path.resolve(process.cwd(), '.env') });
+
+import { Pool } from 'pg';
+import { PrismaPg } from '@prisma/adapter-pg';
+import { PrismaClient } from '@prisma/client';
+
+// Everything below runs at module load, i.e. also when this file is merely imported.
+// The connection string comes from DATABASE_URL, with a local Postgres URL as the fallback.
+const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
+// Log the target with the ":<password>@" portion of the URL replaced by ":***@".
+console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
+const pool = new Pool({
+  connectionString: dbUrl,
+  // NOTE(review): TLS certificate verification is disabled whenever the URL contains "neon"
+  // (presumably Neon-hosted Postgres — confirm). Verify this relaxation is still required.
+  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
+});
+// Prisma is wired to the pg pool through the driver adapter rather than its own connection.
+const adapter = new PrismaPg(pool);
+const prisma = new PrismaClient({ adapter });
+
+// ─── Types ────────────────────────────────────────────────────────────────────
+
+/**
+ * One mass time on one day of the week, as returned by parseScheduleLine and parseWeekdayLine.
+ */
+export interface ParsedSchedule {
+  dayOfWeek: number;   // 0=Sun, 1=Mon, ..., 6=Sat
+  time: string;        // "HH:MM"
+  language: string;    // "English" | "Cantonese" | "Tagalog"
+  notes: string | null; // leftover text / conditional prefix from the source line, or null
+}
+
+/**
+ * Everything parseEntry extracts from a single directory entry.
+ */
+export interface ParsedEntry {
+  locationName: string;          // last name line before the Path/Close marker ('Unknown' if none)
+  parishName: string | null;     // name line preceding locationName, when there is one
+  address: string | null;        // null when the entry has no "Address" heading
+  phone: string | null;          // full-width parentheses already converted to ASCII
+  email: string | null;
+  schedules: ParsedSchedule[];   // Sunday, anticipated (Saturday) and weekday masses combined
+}
+
+/**
+ * The subset of church columns that main() selects from the database and that
+ * findMatch / upsertChurch read when matching and updating.
+ */
+interface ExistingChurch {
+  id: string;
+  name: string;
+  address: string | null;
+  phone: string | null;
+  email: string | null;
+}
+
+/**
+ * Running counters for one import run; mutated by upsertChurch and main, printed in the summary.
+ */
+interface ImportStats {
+  matched: number;          // entries that matched an existing church
+  created: number;          // entries that produced (or, in dry run, would produce) a new church
+  schedulesWritten: number; // sum of parsed schedule counts over processed entries
+  skipped: number;          // entries that failed to parse or had no usable location name
+}
+
+// ─── Parser ───────────────────────────────────────────────────────────────────
+
+// Lower-cased line contents that extractNames ignores when looking for name lines
+// (the empty string covers blank lines).
+const ARTIFACT_LINES = new Set(['share', 'path', 'close', '']);
+
+// Recognised language words. NOTE(review): not referenced anywhere else in the visible file —
+// parseScheduleLine and parseWeekdayLine each inline their own variant with optional parentheses.
+const LANG_PATTERN = /(Cantonese|English|Tagalog|Chinese)/i;
+
+// ─── Task 2: Entry splitter and name extractor ────────────────────────────────
+
+/**
+ * Cut the raw paste into one string per directory entry.
+ *
+ * Line endings are normalised to "\n" and the text is split on the "\nPath\nClose\n" marker.
+ * Entry i is rebuilt as: the segment before marker i, the marker, and the segment after it.
+ * Each returned string therefore starts with whatever preceded its marker (for the first
+ * entry that includes the file header) and ends with the text leading up to the next marker.
+ * Input without any marker yields an empty array.
+ */
+export function splitEntries(raw: string): string[] {
+  const MARKER = '\nPath\nClose\n';
+  const segments = raw.replace(/\r\n/g, '\n').replace(/\r/g, '\n').split(MARKER);
+  // segments[idx] is the text before the marker that introduces `after`.
+  return segments
+    .slice(1)
+    .map((after, idx) => segments[idx] + MARKER + after);
+}
+
+/**
+ * Pick the location and parish names out of the text that precedes an entry's marker.
+ *
+ * Lines are ignored when they are blank, match an ARTIFACT_LINES word (case-insensitive),
+ * or begin with a space. Of the remaining lines, the last is the location name and the one
+ * before it (if any) is the parish name. With no usable line the location is 'Unknown'.
+ */
+export function extractNames(preMarker: string): { locationName: string; parishName: string | null } {
+  const candidates: string[] = [];
+  for (const rawLine of preMarker.split('\n')) {
+    const line = rawLine.trimEnd();
+    if (line.startsWith(' ')) continue;
+    if (ARTIFACT_LINES.has(line.trim().toLowerCase())) continue;
+    candidates.push(line.trim());
+  }
+
+  const count = candidates.length;
+  if (count === 0) return { locationName: 'Unknown', parishName: null };
+
+  const locationName = candidates[count - 1];
+  const parishName = count >= 2 ? candidates[count - 2] : null;
+  return { locationName, parishName };
+}
+
+// ─── Task 3: Field extractor ──────────────────────────────────────────────────
+
+/**
+ * Read the Address, Phone and Email values from an entry body (the text after Path/Close).
+ *
+ * Each value is the text following its "<Heading>\n" line, up to the next blank line, the next
+ * known heading, or the end of the body; inner newlines are flattened to spaces.
+ * Address and phone additionally get full-width parentheses （ ） replaced by ASCII ( ).
+ * A heading that is absent or has no text yields null.
+ */
+export function extractFields(body: string): { address: string | null; phone: string | null; email: string | null } {
+  // Text under one heading, flattened to a single trimmed line; null when missing or empty.
+  const readField = (fieldName: string): string | null => {
+    const pattern = new RegExp(`\\b${fieldName}\\n([\\s\\S]*?)(?:\\n\\n|\\nFax|\\nEmail|\\nWebsite|\\nChurch|\\nParish|\\nAssistant|\\nDeacon|\\nSister|\\nChairperson|\\nResident|\\nRector|\\nP\\.C|\\nPastoral|\\nMass Time|$)`, 'i');
+    const found = pattern.exec(body);
+    if (found === null) return null;
+    const flattened = found[1].replace(/\n/g, ' ').trim();
+    return flattened === '' ? null : flattened;
+  };
+
+  // Full-width → ASCII parentheses; null passes through untouched.
+  const asciiParens = (value: string | null): string | null =>
+    value === null ? null : value.replace(/（/g, '(').replace(/）/g, ')').trim();
+
+  return {
+    address: asciiParens(readField('Address')),
+    phone: asciiParens(readField('Phone')),
+    email: readField('Email'),
+  };
+}
+
+// ─── Task 4: Time normalizer ──────────────────────────────────────────────────
+
+/**
+ * Normalise a time string to "HH:MM" 24-hour format.
+ *
+ * Accepts: "8:00am", "11:30 am", "7:00 a.m.", "12:00 noon", bare "noon", etc. The time may be
+ * embedded in longer text; the first h:mm + period occurrence is used.
+ *
+ * @param raw - Text that may contain a clock time.
+ * @returns "HH:MM", or null when no time is found or the digits are not a valid clock value
+ *          (hour above 12 with am/pm, hour above 23 with "noon", or minutes above 59).
+ */
+export function normalizeTime(raw: string): string | null {
+  const s = raw.trim().toLowerCase();
+  const pad = (n: number): string => String(n).padStart(2, '0');
+
+  if (s.includes('noon')) {
+    if (s === 'noon') return '12:00';
+    const noon = s.match(/(\d{1,2}):(\d{2})\s*noon/);
+    if (noon) {
+      const h = parseInt(noon[1], 10);
+      const min = parseInt(noon[2], 10);
+      // Out-of-range digits are not returned; the am/pm pattern below gets a chance instead.
+      if (h <= 23 && min <= 59) return `${pad(h)}:${pad(min)}`;
+    }
+    // "noon" can also be part of another word ("afternoon"), so keep looking for am/pm.
+  }
+
+  const m = s.match(/(\d{1,2}):(\d{2})\s*(am|pm|a\.m\.|p\.m\.)/);
+  if (!m) return null;
+
+  let h = parseInt(m[1], 10);
+  const min = parseInt(m[2], 10);
+  // Reject impossible 12-hour values instead of emitting strings such as "25:00" or "08:75".
+  if (h > 12 || min > 59) return null;
+
+  const period = m[3].replace(/\./g, '');
+  if (period === 'am') {
+    if (h === 12) h = 0; // 12 am is midnight
+  } else if (h !== 12) {
+    h += 12; // 12 pm stays 12; every other pm hour shifts by 12
+  }
+
+  return `${pad(h)}:${pad(min)}`;
+}
+
+// ─── Task 5: Schedule line parser ────────────────────────────────────────────
+
+// Leading "<words> <day name or 'month'> ...:" prefix, e.g. "5th Sunday of the month:".
+const CONDITIONAL_PATTERN = /^([\w\s]+(?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|month)[^:]*:)\s*/i;
+
+/**
+ * Parse a single schedule text line into 0-N ParsedSchedule records.
+ *
+ * Steps: take the first language tag (parenthesised or not, closing paren optional; "Chinese"
+ * is stored as "Cantonese", default is "English"), drop "Saturday" / "on Saturday" anchors,
+ * peel off a conditional "<...>:" prefix as a line-level note, then split on commas and keep
+ * every token that contains a recognisable time.
+ *
+ * @param line - One line from a Sunday or Anticipated Sunday section.
+ * @param dayOfWeek - The fixed day for this section (0=Sun, 6=Sat for Anticipated).
+ * @returns One record per time found; tokens without a time are ignored.
+ */
+export function parseScheduleLine(line: string, dayOfWeek: number): ParsedSchedule[] {
+  let remainder = line.trim();
+  let language = 'English';
+  let sectionNotes: string | null = null;
+
+  // Extract language tag (with or without closing paren)
+  const langMatch = remainder.match(/\(?(Cantonese|English|Tagalog|Chinese)\)?/i);
+  if (langMatch) {
+    const raw = langMatch[1].toLowerCase();
+    language = raw === 'cantonese' || raw === 'chinese' ? 'Cantonese'
+      : raw === 'tagalog' ? 'Tagalog'
+      : 'English';
+    remainder = remainder.replace(langMatch[0], '').trim();
+  }
+
+  // Strip "Saturday" / "on Saturday" anchors (Anticipated Sunday section).
+  // Only an "on" that directly precedes "Saturday" is removed, so the word survives in notes.
+  remainder = remainder.replace(/\b(?:on\s+)?Saturday\b/gi, '').trim();
+
+  // Extract conditional note prefix
+  const condMatch = remainder.match(CONDITIONAL_PATTERN);
+  if (condMatch) {
+    sectionNotes = condMatch[1].replace(/:$/, '').trim();
+    remainder = remainder.slice(condMatch[0].length);
+  }
+
+  // Split by comma into time tokens
+  const tokens = remainder.split(',').map(t => t.trim()).filter(Boolean);
+  const results: ParsedSchedule[] = [];
+
+  for (const token of tokens) {
+    const time = normalizeTime(token);
+    if (!time) continue;
+
+    // Anything in the token that isn't the time or period is a note
+    const noteText = token
+      .replace(/\d{1,2}:\d{2}\s*(am|pm|a\.m\.|p\.m\.|noon)/i, '')
+      .replace(/\s+/g, ' ')
+      .trim() || null;
+
+    results.push({
+      dayOfWeek,
+      time,
+      language,
+      // A token's own leftover text wins over the line-level conditional prefix.
+      notes: noteText || sectionNotes,
+    });
+  }
+
+  return results;
+}
+
+// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────
+
+// Day-name lookups used by parseDays; keys are lower-case with dots removed, values are
+// 0=Sun .. 6=Sat. The day regexes accept any word that starts with a three-letter day
+// abbreviation, so the longer spellings ("tues", "weds", "thurs") are listed as well —
+// an unknown key would be dropped and the line would fall back to other days.
+const DAY_ABBREV: Record<string, number> = {
+  mon: 1, tue: 2, tues: 2, wed: 3, weds: 3, thur: 4, thurs: 4, thu: 4, fri: 5, sat: 6, sun: 0,
+};
+const DAY_FULL: Record<string, number> = {
+  monday: 1, tuesday: 2, wednesday: 3, thursday: 4, friday: 5, saturday: 6, sunday: 0,
+};
+
+/**
+ * Turn a day prefix such as "Mon. to Sat.", "Tue., Thur. & Sat." or "Monday:" into
+ * day-of-week numbers (0=Sun .. 6=Sat).
+ *
+ * Dots and colons are ignored. "X to Y" expands to the inclusive range, wrapping past
+ * Saturday when needed; otherwise the text is read as a list separated by commas,
+ * ampersands or whitespace. Words that are not day names are ignored, and duplicates
+ * are removed while keeping first-seen order.
+ */
+function parseDays(prefix: string): number[] {
+  const lookup = (word: string): number | undefined => DAY_FULL[word] ?? DAY_ABBREV[word];
+  const cleaned = prefix.toLowerCase().replace(/\./g, '').replace(/:/g, '').trim();
+
+  // Range form: "monday to friday" or "mon to sat"
+  const range = /(\w+)\s+to\s+(\w+)/.exec(cleaned);
+  if (range) {
+    const start = lookup(range[1]);
+    const end = lookup(range[2]);
+    if (start !== undefined && end !== undefined) {
+      const span: number[] = [start];
+      for (let d = start; d !== end; ) {
+        d = (d + 1) % 7;
+        span.push(d);
+      }
+      return span;
+    }
+  }
+
+  // List form: "mon, tue, thur" or "tue & sat"
+  const seen = new Set<number>();
+  for (const piece of cleaned.split(/[,&\s]+/)) {
+    const word = piece.trim();
+    if (!word) continue;
+    const day = lookup(word);
+    if (day !== undefined) seen.add(day);
+  }
+  return [...seen];
+}
+
+// Matches a day-prefix at the start of a token (requires trailing space/colon).
+// Capture group 1 is the prefix without that trailing space/colon. Two shapes are accepted:
+//   - a word starting with a day abbreviation, optionally followed by more such words joined
+//     with "," or "&", and optionally by "to <word>";
+//   - a full day name, optionally followed by "to <word>".
+// Because the abbreviation may be followed by further word characters, the first shape also
+// covers longer spellings that begin with the same letters.
+const DAY_PREFIX_RE = /^((?:(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*(?:[,&]\s*(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*)*(?:to\s+\w+\.?\s*)?)|(?:(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+to\s+\w+)?))[\s:]+/i;
+
+// Matches a token that is ONLY a single day word with no time — e.g. "Mon." "Tue."
+// Anchored at both ends, so any extra text in the token makes it fail.
+const PURE_DAY_RE = /^((?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?|(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))\.?$/i;
+
+/**
+ * Parse a weekday mass line that may have day prefixes.
+ * Algorithm: split by comma, process each token; track current days across tokens.
+ *
+ * - The first language tag on the line applies to every record ("Chinese" is stored as
+ *   "Cantonese"; the default is "English").
+ * - Until a day prefix is seen, times apply to Monday-Friday.
+ * - Tokens that contain only a day ("Mon.", "Tue.") are accumulated and attached to the
+ *   next token that carries a time.
+ * - Once days are set they stay in effect for later time-only tokens on the same line.
+ * - Every record is created with notes = null.
+ *
+ * @param line - One line from a "Weekday Masses" section.
+ * @returns One record per (day, time) combination found on the line.
+ */
+export function parseWeekdayLine(line: string): ParsedSchedule[] {
+  let remainder = line.trim();
+  let language = 'English';
+
+  const langMatch = remainder.match(/\(?(Cantonese|English|Tagalog|Chinese)\)?/i);
+  if (langMatch) {
+    const raw = langMatch[1].toLowerCase();
+    language = raw === 'cantonese' || raw === 'chinese' ? 'Cantonese'
+      : raw === 'tagalog' ? 'Tagalog' : 'English';
+    // Remove the tag itself, then any "(" left dangling at the end of the line.
+    remainder = remainder.replace(langMatch[0], '').replace(/\s*\(\s*$/, '').trim();
+  }
+
+  const results: ParsedSchedule[] = [];
+  const tokens = remainder.split(',').map(t => t.trim()).filter(Boolean);
+  let currentDays: number[] = [1, 2, 3, 4, 5]; // default Mon–Fri
+  let accumulatedDays: number[] = []; // day-only tokens accumulate here until a time appears
+
+  for (const token of tokens) {
+    const prefixMatch = token.match(DAY_PREFIX_RE);
+    if (prefixMatch) {
+      // Token starts with a day prefix; whatever follows the prefix may hold the time.
+      const days = parseDays(prefixMatch[1]);
+      const timePart = token.slice(prefixMatch[0].length);
+      const time = normalizeTime(timePart);
+      if (time) {
+        // Merge any previously accumulated day-only tokens with this token's days
+        const mergedDays = accumulatedDays.length > 0
+          ? [...new Set([...accumulatedDays, ...days])]
+          : days.length > 0 ? days : currentDays;
+        accumulatedDays = [];
+        if (mergedDays.length > 0) currentDays = mergedDays;
+        for (const day of currentDays) results.push({ dayOfWeek: day, time, language, notes: null });
+      } else {
+        // Day-only token via prefix match: accumulate
+        if (days.length > 0) accumulatedDays.push(...days);
+      }
+    } else if (PURE_DAY_RE.test(token)) {
+      // Pure day token like "Mon." "Tue." "Tuesday" — accumulate
+      const days = parseDays(token);
+      if (days.length > 0) accumulatedDays.push(...days);
+    } else {
+      // No day information in this token: it is either a bare time or ignorable text.
+      const time = normalizeTime(token);
+      if (time) {
+        // Apply any accumulated days, then reset
+        if (accumulatedDays.length > 0) {
+          currentDays = [...new Set(accumulatedDays)];
+          accumulatedDays = [];
+        }
+        for (const day of currentDays) results.push({ dayOfWeek: day, time, language, notes: null });
+      }
+    }
+  }
+
+  return results;
+}
+
+// ─── Task 7: Full entry parser ────────────────────────────────────────────────
+
+// Section headings (lower-cased) whose lines are ignored entirely.
+const SKIP_SECTIONS = new Set(['special masses', 'eucharist adoration']);
+
+/**
+ * Turn one raw entry string (names, marker, body) into a ParsedEntry.
+ *
+ * The text before the first "\nPath\nClose\n" marker supplies the names and the text after
+ * it supplies the contact fields and the "Mass Time" block. When the marker is missing the
+ * whole input is treated as body. Inside the mass block, lines are routed by the most recent
+ * section heading: "Sunday Masses" → day 0, "Anticipated Sunday Masses" → day 6,
+ * "Weekday Masses" → parseWeekdayLine; lines under a SKIP_SECTIONS heading, or before any
+ * heading, are dropped. The mass block ends at a "Share" line or at the end of the body.
+ */
+export function parseEntry(raw: string): ParsedEntry {
+  const MARKER = '\nPath\nClose\n';
+  const markerAt = raw.indexOf(MARKER);
+  const hasMarker = markerAt >= 0;
+  const pre = hasMarker ? raw.slice(0, markerAt) : '';
+  const body = hasMarker ? raw.slice(markerAt + MARKER.length) : raw;
+
+  const names = extractNames(pre);
+  const fields = extractFields(body);
+  const schedules: ParsedSchedule[] = [];
+
+  const massText = body.match(/Mass Time\n([\s\S]*?)(?:Share\n|$)/i)?.[1];
+  if (massText !== undefined) {
+    let section: 'sunday' | 'anticipated' | 'weekday' | 'skip' | null = null;
+
+    for (const rawLine of massText.split('\n')) {
+      const text = rawLine.trim();
+      if (text === '') continue;
+
+      // Headings switch the active section and never produce schedules themselves.
+      const heading = text.toLowerCase();
+      switch (heading) {
+        case 'sunday masses':
+          section = 'sunday';
+          continue;
+        case 'anticipated sunday masses':
+          section = 'anticipated';
+          continue;
+        case 'weekday masses':
+          section = 'weekday';
+          continue;
+      }
+      if (SKIP_SECTIONS.has(heading)) {
+        section = 'skip';
+        continue;
+      }
+
+      switch (section) {
+        case 'sunday':
+          schedules.push(...parseScheduleLine(text, 0));
+          break;
+        case 'anticipated':
+          schedules.push(...parseScheduleLine(text, 6));
+          break;
+        case 'weekday':
+          schedules.push(...parseWeekdayLine(text));
+          break;
+        default:
+          // 'skip' or no heading seen yet: ignore the line.
+          break;
+      }
+    }
+  }
+
+  return {
+    locationName: names.locationName,
+    parishName: names.parishName,
+    address: fields.address,
+    phone: fields.phone,
+    email: fields.email,
+    schedules,
+  };
+}
+
+// ─── Task 8: Name normalizer + matcher ───────────────────────────────────────
+
+// Words that carry no identifying information when comparing church names.
+const NOISE_WORDS = new Set([
+  'church', 'parish', 'chapel', 'centre', 'center', 'mass',
+  'saint', 'st', 'our', 'lady', 'of', 'the', 'a', 'an',
+]);
+
+/**
+ * Reduce a church name to its distinguishing words for comparison.
+ *
+ * The name is lower-cased, accents are stripped, every character other than a-z, 0-9 and
+ * whitespace becomes a space, and words that are shorter than two characters or listed in
+ * NOISE_WORDS are dropped. The survivors are joined with single spaces (possibly giving '').
+ */
+export function normalizeName(name: string): string {
+  const unaccented = name.toLowerCase().normalize('NFD').replace(/[\u0300-\u036f]/g, '');
+  const words = unaccented.replace(/[^a-z0-9\s]/g, ' ').split(/\s+/);
+
+  const kept: string[] = [];
+  for (const word of words) {
+    if (word.length < 2) continue;
+    if (NOISE_WORDS.has(word)) continue;
+    kept.push(word);
+  }
+  return kept.join(' ').trim();
+}
+
+/**
+ * Jaccard similarity of the space-separated word sets of two strings:
+ * |A ∩ B| / |A ∪ B|, or 0 when either string has no words.
+ */
+function wordOverlap(a: string, b: string): number {
+  const wordsA = new Set(a.split(' ').filter(Boolean));
+  const wordsB = new Set(b.split(' ').filter(Boolean));
+  if (!(wordsA.size > 0 && wordsB.size > 0)) return 0;
+
+  const shared = [...wordsA].filter(word => wordsB.has(word)).length;
+  return shared / (wordsA.size + wordsB.size - shared);
+}
+
+/**
+ * Find the best-matching existing church for a parsed entry.
+ *
+ * 1. Name match: the candidate with the highest word-overlap score between normalised names
+ *    wins, provided the score reaches 0.4 (the first candidate wins ties).
+ * 2. Address fallback: otherwise, the first candidate whose address contains the first
+ *    12 characters of the parsed address (case-insensitive). Addresses shorter than 12
+ *    characters are not used, because a short fragment such as a district name could match
+ *    unrelated churches — and a match leads to that church's schedules being replaced.
+ *
+ * @param locationName - Parsed location name of the entry.
+ * @param address - Parsed address, or null when the entry has none.
+ * @param existing - Candidate churches to match against.
+ * @returns The matched church, or null if no match meets the threshold.
+ */
+export function findMatch(
+  locationName: string,
+  address: string | null,
+  existing: ExistingChurch[]
+): ExistingChurch | null {
+  const NAME_SCORE_THRESHOLD = 0.4;
+  const ADDRESS_PREFIX_LENGTH = 12;
+
+  const normTarget = normalizeName(locationName);
+  let best: ExistingChurch | null = null;
+  let bestScore = 0;
+
+  for (const church of existing) {
+    const score = wordOverlap(normTarget, normalizeName(church.name));
+    if (score > bestScore) {
+      bestScore = score;
+      best = church;
+    }
+  }
+
+  if (bestScore >= NAME_SCORE_THRESHOLD) return best;
+
+  // Fallback: address prefix match, only with a full-length prefix.
+  const addrPrefix = (address ?? '').trim().slice(0, ADDRESS_PREFIX_LENGTH).toLowerCase();
+  if (addrPrefix.length === ADDRESS_PREFIX_LENGTH) {
+    for (const church of existing) {
+      if (church.address?.toLowerCase().includes(addrPrefix)) return church;
+    }
+  }
+
+  return null;
+}
+
+// ─── DB Operations ────────────────────────────────────────────────────────────
+
+/** Map parsed schedules to rows for massSchedule.createMany, all tied to one church. */
+function toScheduleRows(schedules: ParsedSchedule[], churchId: string) {
+  return schedules.map(s => ({
+    churchId,
+    dayOfWeek: s.dayOfWeek,
+    time: s.time,
+    language: s.language,
+    notes: s.notes ?? null,
+  }));
+}
+
+/**
+ * Write one parsed entry to the database, or just report it in dry-run mode.
+ *
+ * - Matched church: fills phone/email only where the stored value is empty and replaces the
+ *   church's schedules, all in one transaction. When the entry produced no schedules the
+ *   existing ones are kept (with a warning) rather than deleted, so a parse miss cannot wipe
+ *   stored data.
+ * - New church: created with source 'diocese-hk' and latitude/longitude 0, together with its
+ *   schedules, in one transaction so a failure cannot leave a church without schedules.
+ *
+ * @param entry - Parsed directory entry.
+ * @param matched - Existing church the entry was matched to, or null to create a new one.
+ * @param dryRun - When true, only logs and updates the counters; nothing is written.
+ * @param stats - Counters updated in place (matched / created / schedulesWritten).
+ */
+async function upsertChurch(
+  entry: ParsedEntry,
+  matched: ExistingChurch | null,
+  dryRun: boolean,
+  stats: ImportStats
+): Promise<void> {
+  const tag = matched ? `[MATCH] ${matched.name} ← ${entry.locationName}` : `[NEW]   ${entry.locationName}`;
+  const schedCount = entry.schedules.length;
+
+  if (dryRun) {
+    console.log(tag);
+    if (!matched && entry.address) console.log(`        Address: ${entry.address}`);
+    console.log(`        ${schedCount} schedules`);
+    if (matched) stats.matched++; else stats.created++;
+    stats.schedulesWritten += schedCount;
+    return;
+  }
+
+  if (matched) {
+    const churchId = matched.id;
+    const update: Record<string, string> = {};
+    if (!matched.phone && entry.phone) update.phone = entry.phone;
+    if (!matched.email && entry.email) update.email = entry.email;
+
+    if (schedCount === 0) {
+      console.warn(`[WARN]  ${entry.locationName}: no schedules parsed, existing schedules kept`);
+    }
+
+    await prisma.$transaction(async tx => {
+      if (Object.keys(update).length > 0) {
+        await tx.church.update({ where: { id: churchId }, data: update });
+      }
+      if (schedCount > 0) {
+        await tx.massSchedule.deleteMany({ where: { churchId } });
+        await tx.massSchedule.createMany({ data: toScheduleRows(entry.schedules, churchId) });
+      }
+    });
+
+    stats.matched++;
+  } else {
+    await prisma.$transaction(async tx => {
+      const newChurch = await tx.church.create({
+        data: {
+          name: entry.locationName,
+          country: 'HK',
+          source: 'diocese-hk',
+          address: entry.address ?? undefined,
+          phone: entry.phone ?? undefined,
+          email: entry.email ?? undefined,
+          // Placeholder coordinates; the commit description says geocoding happens later.
+          latitude: 0,
+          longitude: 0,
+          hasWebsite: false,
+        },
+      });
+
+      if (schedCount > 0) {
+        await tx.massSchedule.createMany({ data: toScheduleRows(entry.schedules, newChurch.id) });
+      }
+    });
+
+    stats.created++;
+  }
+
+  stats.schedulesWritten += schedCount;
+  console.log(tag);
+}
+
+// ─── Main ─────────────────────────────────────────────────────────────────────
+
+/**
+ * CLI entry point: read the parish text file, parse every entry, match each one against the
+ * HK churches already in the database and upsert it, then print a summary.
+ *
+ * Flags: `--dry-run` (report only, no writes) and `--file <path>` (defaults to
+ * scripts/hk-parishes.txt under the current working directory).
+ *
+ * The database connections are closed whether the run succeeds or throws.
+ *
+ * @throws Error when `--file` is given without a path; file-system and database errors
+ *         propagate to the caller.
+ */
+async function main() {
+  const args = process.argv.slice(2);
+  const dryRun = args.includes('--dry-run');
+  const fileArgIdx = args.indexOf('--file');
+  const fileArg = fileArgIdx >= 0 ? args[fileArgIdx + 1] : undefined;
+
+  try {
+    // "--file" as the last argument, or followed by another flag, has no usable value.
+    if (fileArgIdx >= 0 && (fileArg === undefined || fileArg.startsWith('--'))) {
+      throw new Error('--file requires a path argument');
+    }
+    const filePath = fileArg ?? path.resolve(process.cwd(), 'scripts/hk-parishes.txt');
+
+    console.log(`\n${'='.repeat(60)}`);
+    console.log(`HK Diocese Parish Import`);
+    console.log(`File: ${filePath}`);
+    console.log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
+    console.log(`${'='.repeat(60)}\n`);
+
+    const raw = fs.readFileSync(filePath, 'utf-8');
+    const entryStrings = splitEntries(raw);
+    console.log(`Found ${entryStrings.length} entries in file\n`);
+
+    const existing = await prisma.church.findMany({
+      where: { country: 'HK' },
+      select: { id: true, name: true, address: true, phone: true, email: true },
+    });
+    console.log(`Loaded ${existing.length} existing HK churches\n`);
+
+    const stats: ImportStats = { matched: 0, created: 0, schedulesWritten: 0, skipped: 0 };
+    // Ids of existing churches that some entry in this run has already been matched to.
+    const matchedIds = new Set<string>();
+
+    for (const entryStr of entryStrings) {
+      let entry: ParsedEntry;
+      try {
+        entry = parseEntry(entryStr);
+      } catch (err) {
+        console.warn(`[SKIP] Failed to parse entry: ${(err as Error).message}`);
+        stats.skipped++;
+        continue;
+      }
+
+      if (!entry.locationName || entry.locationName === 'Unknown') {
+        stats.skipped++;
+        continue;
+      }
+
+      const matched = findMatch(entry.locationName, entry.address, existing);
+      if (matched) {
+        if (matchedIds.has(matched.id)) {
+          console.warn(`[WARN]  "${matched.name}" is matched by more than one entry in this run — review the match for "${entry.locationName}"`);
+        }
+        matchedIds.add(matched.id);
+      }
+      await upsertChurch(entry, matched, dryRun, stats);
+    }
+
+    console.log(`\n${'='.repeat(60)}`);
+    console.log(`Import Summary`);
+    console.log(`${'='.repeat(60)}`);
+    console.log(`Matched existing:   ${stats.matched}`);
+    console.log(`New churches:       ${stats.created}`);
+    console.log(`Skipped:            ${stats.skipped}`);
+    console.log(`Schedules written:  ${stats.schedulesWritten}`);
+    console.log(`${'='.repeat(60)}\n`);
+  } finally {
+    await prisma.$disconnect();
+    await pool.end();
+  }
+}
+
+// Run the import only when this file is the script being executed; importing it
+// (as the tests do) must not start an import run.
+import { fileURLToPath } from 'url';
+const invokedDirectly = process.argv[1] === fileURLToPath(import.meta.url);
+if (invokedDirectly) {
+  main().catch((err) => {
+    console.error('Fatal error:', err);
+    process.exit(1);
+  });
+}
Author	SHA1	Message	Date
albertfj114	92265cf27f	feat: add DB operations and CLI wiring for HK parish import upsertChurch() handles matched churches (replace schedules atomically via $transaction, update contact fields if null) and new churches (create with source='diocese-hk', lat/lng=0 for later geocoding). main() wires up CLI args, file reading, matching loop, and summary. Guards main() call with ESM import.meta.url check to prevent execution on import during tests. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-03 16:27:02 -04:00
albertfj114	8075072c24	fix: use true Jaccard similarity in wordOverlap (intersection/union) Replaces max(\|A\|,\|B\|) denominator with \|A∪B\| = \|A\|+\|B\|-intersection, which is the correct Jaccard formula and avoids inflating similarity when both name sets have significant unique words. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-03 16:25:24 -04:00
albertfj114	3ebbc3732f	feat: add name normalizer and church matcher for HK import normalizeName strips noise words (church/parish/chapel/etc), accents, and punctuation for robust name comparison. findMatch uses word-overlap Jaccard score (threshold 0.4) with address-prefix fallback for Chinese- named churches where English name overlap may be low. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-03 16:23:58 -04:00
albertfj114	eedb442e78	feat: add full entry parser for HK parishes parseEntry composes extractNames, extractFields, parseScheduleLine, and parseWeekdayLine into a single ParsedEntry. Routes schedule lines by section header (Sunday/Anticipated/Weekday) and skips Special Masses and Eucharist Adoration sections. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-03 16:18:05 -04:00
albertfj114	38274174a9	feat: add HK parish import parser functions (Tasks 2-6) Implements splitEntries, extractNames, extractFields, normalizeTime, parseScheduleLine, and parseWeekdayLine with 26 passing unit tests. Handles full-width parentheses, language tags, conditional schedule notes, day ranges, and comma-separated day/time lists. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-03 16:15:04 -04:00
albertfj114	328d146201	feat: add HK parish parser functions (Tasks 2-6) with tests Implements entry splitter, name extractor, field extractor, time normalizer, schedule line parser, and weekday day-prefix parser. All 26 tests pass. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-03 16:06:26 -04:00
albertfj114	9aea12f4b0	feat: add HK parish import script skeleton - Imports, types, and Prisma client init - ParsedSchedule and ParsedEntry types for parsing parish data - ExistingChurch interface for matching - ImportStats interface for tracking progress Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-03 15:59:51 -04:00