feat: add HK parish parser functions (Tasks 2-6) with tests

Implements entry splitter, name extractor, field extractor, time normalizer, schedule line parser, and weekday day-prefix parser. All 26 tests pass. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-03 16:06:26 -04:00
parent 9aea12f4b0
commit 328d146201
2 changed files with 447 additions and 0 deletions
--- a/scripts/import-hk-parishes.ts
+++ b/scripts/import-hk-parishes.ts
@@ -56,3 +56,262 @@ interface ImportStats {
  schedulesWritten: number;
  skipped: number;
 }
+
+// ─── Parser ───────────────────────────────────────────────────────────────────
+
+const ARTIFACT_LINES = new Set(['share', 'path', 'close', '']); // trimmed, lower-cased line contents that extractNames throws away; '' covers blank lines
+
+const LANG_PATTERN = /(Cantonese|English|Tagalog|Chinese)/i; // case-insensitive language keywords. NOTE(review): no function visible in this chunk references this constant — the parsers inline their own regex with the same alternation; confirm whether it is still needed
+
+// ─── Task 2: Entry splitter and name extractor ────────────────────────────────
+
+/**
+ * Break the raw file text into one string per parish entry.
+ *
+ * Line endings are first normalised to "\n". The text is then cut at every
+ * "\nPath\nClose\n" marker and each entry is rebuilt as
+ * `<segment before the marker> + marker + <segment after the marker>`.
+ *
+ * Consequences of that reconstruction:
+ * - the pre-marker part of the first entry still contains the file header
+ *   ("HONG KONG CHURCHES\n\n...") in front of the first entry's name lines;
+ * - the post-marker part of every entry except the last ends with the name
+ *   lines of the following entry, and that same segment is reused as the
+ *   pre-marker part of the next entry.
+ *
+ * @param raw - Full text of the source file.
+ * @returns One string per marker found; an empty array when the marker never occurs.
+ */
+export function splitEntries(raw: string): string[] {
+  const MARKER = '\nPath\nClose\n';
+  const segments = raw
+    .replace(/\r\n/g, '\n')
+    .replace(/\r/g, '\n')
+    .split(MARKER);
+
+  // Pair every segment with its predecessor: segments[idx] precedes `after`.
+  return segments.slice(1).map((after, idx) => segments[idx] + MARKER + after);
+}
+
+/**
+ * Derive the location name and optional parish name from the text that
+ * precedes an entry's Path/Close marker.
+ *
+ * Lines are right-trimmed, then dropped when they are blank, when their
+ * trimmed lower-case content is listed in ARTIFACT_LINES, or when they start
+ * with a space. Of the lines that remain, the last one is the location name
+ * and the one before it (if any) is the parish name.
+ *
+ * @param preMarker - Text appearing before the Path/Close marker.
+ * @returns `locationName` is 'Unknown' when no usable line remains;
+ *          `parishName` is null when fewer than two usable lines remain.
+ */
+export function extractNames(preMarker: string): { locationName: string; parishName: string | null } {
+  const candidates: string[] = [];
+  for (const rawLine of preMarker.split('\n')) {
+    const line = rawLine.trimEnd();
+    const key = line.trim().toLowerCase();
+    if (ARTIFACT_LINES.has(key) || line.startsWith(' ')) continue;
+    if (line.trim().length === 0) continue;
+    candidates.push(line);
+  }
+
+  const total = candidates.length;
+  if (total === 0) return { locationName: 'Unknown', parishName: null };
+
+  const location = candidates[total - 1].trim();
+  if (total === 1) return { locationName: location, parishName: null };
+
+  return { locationName: location, parishName: candidates[total - 2].trim() };
+}
+
+// ─── Task 3: Field extractor ──────────────────────────────────────────────────
+
+/**
+ * Extract address, phone and email from the entry body (text after Path/Close).
+ *
+ * Each field is a label line ("Address", "Phone", "Email"; matched
+ * case-insensitively) followed by its value lines. The value runs until a
+ * blank line, the next recognised label line, or the end of the text; line
+ * breaks inside the value are collapsed to single spaces.
+ *
+ * Full-width parentheses （ ） in the address and phone are normalised to
+ * ASCII ( ). The email is returned as captured.
+ *
+ * @param body - Entry text following the Path/Close marker.
+ * @returns The three fields; a field is null when its label is missing or its value is empty.
+ */
+export function extractFields(body: string): { address: string | null; phone: string | null; email: string | null } {
+  const normalise = (s: string) => s.replace(/（/g, '(').replace(/）/g, ')').trim();
+
+  function extractField(fieldName: string): string | null {
+    // "\nPhone" is part of the terminator list so that an Address block that is
+    // followed directly by the Phone label (no blank line in between) stops at
+    // the label instead of absorbing the phone number into the address.
+    const regex = new RegExp(`\\b${fieldName}\\n([\\s\\S]*?)(?:\\n\\n|\\nPhone|\\nFax|\\nEmail|\\nWebsite|\\nChurch|\\nParish|\\nAssistant|\\nDeacon|\\nResident|\\nRector|\\nP\\.C|\\nPastoral|\\nMass Time|$)`, 'i');
+    const m = body.match(regex);
+    if (!m) return null;
+    const value = m[1].replace(/\n/g, ' ').trim();
+    return value || null;
+  }
+
+  const address = extractField('Address');
+  const rawPhone = extractField('Phone');
+  const email = extractField('Email');
+
+  return {
+    address: address ? normalise(address) : null,
+    phone: rawPhone ? normalise(rawPhone) : null,
+    email,
+  };
+}
+
+// ─── Task 4: Time normalizer ──────────────────────────────────────────────────
+
+/**
+ * Normalise a time string to "HH:MM" 24-hour format.
+ * Accepts: "8:00am", "11:30 am", "7:00 a.m.", "12:00 noon", etc.
+ *
+ * A "H:MM noon" time is returned as written (zero-padded). Otherwise the first
+ * "H:MM" followed by am / pm / a.m. / p.m. is converted: 12 am becomes 00 and
+ * pm hours other than 12 are shifted by twelve.
+ *
+ * @param raw - Text containing a clock time; surrounding text is tolerated.
+ * @returns The "HH:MM" string, or null when no time is found or when the
+ *          digits do not form a real clock time (minutes above 59, a 12-hour
+ *          value above 12, a noon hour above 23).
+ */
+export function normalizeTime(raw: string): string | null {
+  const s = raw.trim().toLowerCase();
+  const pad = (n: number): string => String(n).padStart(2, '0');
+
+  if (s.includes('noon')) {
+    const noon = s.match(/(\d{1,2}):(\d{2})\s*noon/);
+    if (noon) {
+      const noonHour = parseInt(noon[1], 10);
+      const noonMin = parseInt(noon[2], 10);
+      if (noonHour > 23 || noonMin > 59) return null;
+      return `${pad(noonHour)}:${pad(noonMin)}`;
+    }
+    // "noon" without a preceding H:MM falls through to the am/pm matcher.
+  }
+
+  const m = s.match(/(\d{1,2}):(\d{2})\s*(am|pm|a\.m\.|p\.m\.)/);
+  if (!m) return null;
+
+  let h = parseInt(m[1], 10);
+  const min = parseInt(m[2], 10);
+  const period = m[3].replace(/\./g, '');
+
+  // Reject digits that cannot be a 12-hour clock reading; converting them
+  // would yield impossible results such as "25:00" or "08:75".
+  if (h > 12 || min > 59) return null;
+
+  if (period === 'am') {
+    if (h === 12) h = 0;
+  } else if (h !== 12) {
+    h += 12;
+  }
+
+  return `${pad(h)}:${pad(min)}`;
+}
+
+// ─── Task 5: Schedule line parser ────────────────────────────────────────────
+
+const CONDITIONAL_PATTERN = /^([\w\s]+(?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|month)[^:]*:)\s*/i; // leading qualifier that ends in a colon and mentions a weekday name or "month", e.g. "First Sunday of the month:"; group 1 includes the colon, the trailing whitespace is matched outside the group
+
+/**
+ * Turn one schedule text line into zero or more ParsedSchedule records, all
+ * carrying the supplied day.
+ *
+ * Processing order:
+ * 1. The first language keyword (Cantonese / English / Tagalog / Chinese,
+ *    with or without surrounding parentheses) selects the language and is
+ *    removed; "Chinese" is reported as "Cantonese", and the default is "English".
+ * 2. The whole words "Saturday" and "on" are removed wherever they occur.
+ * 3. A leading qualifier matching CONDITIONAL_PATTERN becomes the line-level
+ *    note (without its colon) and is cut off.
+ * 4. The rest is split on commas. Pieces without a recognisable time are
+ *    skipped; for the others, whatever is left after removing the time text is
+ *    the note, falling back to the line-level note when nothing is left.
+ *
+ * @param line - A single line of schedule text.
+ * @param dayOfWeek - The fixed day for this section (0=Sun, 6=Sat for Anticipated).
+ * @returns One record per comma-separated piece that contains a time.
+ */
+export function parseScheduleLine(line: string, dayOfWeek: number): ParsedSchedule[] {
+  let text = line.trim();
+
+  // Language tag (with or without closing paren)
+  let language = 'English';
+  const tag = text.match(/\(?(Cantonese|English|Tagalog|Chinese)\)?/i);
+  if (tag) {
+    switch (tag[1].toLowerCase()) {
+      case 'cantonese':
+      case 'chinese':
+        language = 'Cantonese';
+        break;
+      case 'tagalog':
+        language = 'Tagalog';
+        break;
+      default:
+        language = 'English';
+    }
+    text = text.replace(tag[0], '').trim();
+  }
+
+  // "Saturday" / "on Saturday" anchors (Anticipated Sunday section)
+  text = text.replace(/\bSaturday\b/gi, '').replace(/\bon\b/gi, '').trim();
+
+  // Conditional prefix becomes the note shared by every time on the line
+  let sectionNotes: string | null = null;
+  const qualifier = text.match(CONDITIONAL_PATTERN);
+  if (qualifier) {
+    sectionNotes = qualifier[1].replace(/:$/, '').trim();
+    text = text.slice(qualifier[0].length);
+  }
+
+  return text
+    .split(',')
+    .map(piece => piece.trim())
+    .filter(Boolean)
+    .flatMap((piece): ParsedSchedule[] => {
+      const time = normalizeTime(piece);
+      if (!time) return [];
+
+      // Whatever surrounds the time inside this piece is its own note
+      const ownNote = piece
+        .replace(/\d{1,2}:\d{2}\s*(am|pm|a\.m\.|p\.m\.|noon)/i, '')
+        .replace(/\s+/g, ' ')
+        .trim();
+
+      return [{ dayOfWeek, time, language, notes: ownNote || sectionNotes }];
+    });
+}
+
+// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────
+
+// Lower-case day abbreviations (dots already removed) mapped to 0=Sunday … 6=Saturday.
+// The longer spellings "tues", "weds" and "thurs" are listed because the day
+// regexes in this file accept any word starting with a three-letter abbreviation.
+const DAY_ABBREV: Record<string, number> = {
+  mon: 1, tue: 2, tues: 2, wed: 3, weds: 3, thur: 4, thurs: 4, thu: 4, fri: 5, sat: 6, sun: 0,
+};
+// Lower-case full day names mapped to 0=Sunday … 6=Saturday.
+const DAY_FULL: Record<string, number> = {
+  monday: 1, tuesday: 2, wednesday: 3, thursday: 4, friday: 5, saturday: 6, sunday: 0,
+};
+
+/**
+ * Convert a day prefix such as "Mon. to Fri.", "Tue & Sat:" or
+ * "Mon, Wed to Fri" into day-of-week numbers (0=Sunday … 6=Saturday).
+ *
+ * Dots and colons are ignored and matching is case-insensitive. The prefix is
+ * read left to right: a day followed by "to <day>" expands to the inclusive
+ * range, wrapping past Saturday when needed ("Monday to Sunday" gives
+ * 1,2,3,4,5,6,0); any other recognised day is taken on its own. Unrecognised
+ * words are skipped.
+ *
+ * @param prefix - Day text, typically the part captured in front of a time.
+ * @returns Distinct day numbers in order of first appearance; empty when no day is recognised.
+ */
+function parseDays(prefix: string): number[] {
+  const tokens = prefix
+    .toLowerCase()
+    .replace(/[.:]/g, '')
+    .trim()
+    .split(/[,&\s]+/)
+    .filter(Boolean);
+
+  const lookup = (token: string | undefined): number | undefined =>
+    token === undefined ? undefined : DAY_FULL[token] ?? DAY_ABBREV[token];
+
+  const days: number[] = [];
+  for (let i = 0; i < tokens.length; i++) {
+    const start = lookup(tokens[i]);
+    if (start === undefined) continue;
+
+    const end = tokens[i + 1] === 'to' ? lookup(tokens[i + 2]) : undefined;
+    if (end === undefined) {
+      days.push(start);
+      continue;
+    }
+
+    // Modular arithmetic lets a range run through the end of the week.
+    const span = ((end - start + 7) % 7) + 1;
+    for (let offset = 0; offset < span; offset++) days.push((start + offset) % 7);
+    i += 2; // skip the "to" and the range end just consumed
+  }
+
+  return [...new Set(days)];
+}
+
+// Matches a day prefix at the start of a token. Group 1 is the prefix itself (one day, a ","/"&" list of days, and/or a "to <word>" range); it must be followed by at least one whitespace or colon character. Each abbreviation is followed by \w*, so any word that merely starts with one ("Thurs", "Monday") is accepted by the first alternative.
+const DAY_PREFIX_RE = /^((?:(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*(?:[,&]\s*(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*)*(?:to\s+\w+\.?\s*)?)|(?:(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+to\s+\w+)?))[\s:]+/i;
+
+// Matches a token that consists of exactly ONE day word with optional trailing dot(s) and nothing else — e.g. "Mon." "Tue." "Tuesday". Lists such as "Mon & Tue" do not match.
+const PURE_DAY_RE = /^((?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?|(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))\.?$/i;
+
+/**
+ * Parse a weekday mass line that may have day prefixes.
+ * Algorithm: split by comma, process each token; track current days across tokens.
+ *
+ * Language: the first Cantonese / English / Tagalog / Chinese keyword (with or
+ * without parentheses) sets the language for every record and is removed from
+ * the line; "Chinese" is reported as "Cantonese" and the default is "English".
+ *
+ * Each comma-separated token then falls into one of three cases:
+ * - day prefix followed by a time: the prefix days, merged with any days
+ *   accumulated from earlier day-only tokens, become the current days and one
+ *   record is emitted per day;
+ * - day(s) without a parseable time: the days are accumulated for the next
+ *   token that carries a time;
+ * - anything else: if it contains a time, accumulated days (when present)
+ *   replace the current days and one record is emitted per current day;
+ *   tokens without a time are ignored.
+ *
+ * The current days start as Monday–Friday and persist from token to token, so
+ * a bare time inherits the days of the most recent prefixed time.
+ *
+ * @param line - A single line of weekday schedule text.
+ * @returns One record per (day, time) pair, in token order; `notes` is always null.
+ */
+export function parseWeekdayLine(line: string): ParsedSchedule[] {
+  let remainder = line.trim();
+  let language = 'English';
+
+  const langMatch = remainder.match(/\(?(Cantonese|English|Tagalog|Chinese)\)?/i);
+  if (langMatch) {
+    const raw = langMatch[1].toLowerCase();
+    language = raw === 'cantonese' || raw === 'chinese' ? 'Cantonese'
+      : raw === 'tagalog' ? 'Tagalog' : 'English';
+    remainder = remainder.replace(langMatch[0], '').replace(/\s*\(\s*$/, '').trim(); // drop the tag, then any "(" left dangling at the end of the line
+  }
+
+  const results: ParsedSchedule[] = [];
+  const tokens = remainder.split(',').map(t => t.trim()).filter(Boolean);
+  let currentDays: number[] = [1, 2, 3, 4, 5]; // default Mon–Fri, used until a token names other days
+  let accumulatedDays: number[] = []; // day-only tokens accumulate here until a time appears
+
+  for (const token of tokens) {
+    const prefixMatch = token.match(DAY_PREFIX_RE);
+    if (prefixMatch) {
+      const days = parseDays(prefixMatch[1]);
+      const timePart = token.slice(prefixMatch[0].length); // text after the prefix and its separator
+      const time = normalizeTime(timePart);
+      if (time) {
+        // Pending day-only tokens are unioned with this token's days; with nothing pending, use this token's days, or keep the current days when the prefix named no recognisable day
+        const mergedDays = accumulatedDays.length > 0
+          ? [...new Set([...accumulatedDays, ...days])]
+          : days.length > 0 ? days : currentDays;
+        accumulatedDays = [];
+        if (mergedDays.length > 0) currentDays = mergedDays;
+        for (const day of currentDays) results.push({ dayOfWeek: day, time, language, notes: null });
+      } else {
+        // Prefix matched but no time followed it: hold the days for a later token
+        if (days.length > 0) accumulatedDays.push(...days);
+      }
+    } else if (PURE_DAY_RE.test(token)) {
+      // Lone day token like "Mon." "Tue." "Tuesday" (no separator after it, so DAY_PREFIX_RE did not match): hold it for a later token
+      const days = parseDays(token);
+      if (days.length > 0) accumulatedDays.push(...days);
+    } else {
+      const time = normalizeTime(token);
+      if (time) {
+        // Pending days, if any, replace the current days; otherwise the time inherits the current days
+        if (accumulatedDays.length > 0) {
+          currentDays = [...new Set(accumulatedDays)];
+          accumulatedDays = [];
+        }
+        for (const day of currentDays) results.push({ dayOfWeek: day, time, language, notes: null });
+      }
+    }
+  }
+
+  return results;
+}