ScraperControl/scripts/import-hk-parishes.ts

#!/usr/bin/env tsx
/**
 * Import HK Diocese parish directory from plain-text paste.
 * Usage: npx tsx scripts/import-hk-parishes.ts [--dry-run] [--file scripts/hk-parishes.txt]
 */

import dotenv from 'dotenv';
import path from 'path';
import fs from 'fs';

dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

// Connection string comes from DATABASE_URL; when that is unset or empty the
// script falls back to a local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the ":password@" portion of the URL masked.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
  connectionString: dbUrl,
  // URLs containing "neon" get an SSL config with certificate verification
  // disabled; for every other URL the ssl option is left undefined.
  // NOTE(review): rejectUnauthorized: false skips TLS certificate validation — confirm this is intended.
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma talks to Postgres through the pg pool via the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

// ─── Types ────────────────────────────────────────────────────────────────────

/** One Mass time on one day of the week, as produced by the schedule line parsers. */
export interface ParsedSchedule {
  dayOfWeek: number;   // 0=Sun, 1=Mon, ..., 6=Sat
  time: string;        // "HH:MM"
  language: string;    // "English" | "Cantonese" | "Tagalog"
  notes: string | null; // free-text qualifier taken from the schedule line, or null when there is none
}

/** Everything parseEntry extracts from one directory entry. */
export interface ParsedEntry {
  locationName: string;        // last name line before the Path/Close marker ("Unknown" if none)
  parishName: string | null;   // name line preceding the location name, when present
  address: string | null;      // value of the "Address" field, joined onto one line
  phone: string | null;        // value of the "Phone" field
  email: string | null;        // value of the "Email" field
  schedules: ParsedSchedule[]; // Mass times parsed from the "Mass Time" section
}

/**
 * Shape of an already-known church record that findMatch compares parsed entries against.
 * NOTE(review): presumably a projection of the church table — confirm against the query that loads it.
 */
interface ExistingChurch {
  id: string;
  name: string;            // compared via normalizeName + wordOverlap
  address: string | null;  // used by the address-prefix fallback match
  phone: string | null;
  email: string | null;
}

/**
 * Running counters for an import run.
 * NOTE(review): the code that updates these is outside this section; the
 * per-field meanings below are inferred from the names — confirm.
 */
interface ImportStats {
  matched: number;          // presumably entries matched to an existing church
  created: number;          // presumably churches newly created
  schedulesWritten: number; // presumably schedule rows written
  skipped: number;          // presumably entries not imported
}

// ─── Parser ───────────────────────────────────────────────────────────────────

// Lines that carry no entry content. extractNames compares each line
// trimmed and lower-cased against this set; the empty string covers blank lines.
const ARTIFACT_LINES = new Set(['share', 'path', 'close', '']);

// Case-insensitive language tag as it appears in schedule text.
// NOTE(review): the schedule parsers use their own inline pattern that also
// allows surrounding parentheses — check whether this constant is still needed.
const LANG_PATTERN = /(Cantonese|English|Tagalog|Chinese)/i;

// ─── Task 2: Entry splitter and name extractor ────────────────────────────────

/**
 * Break the pasted directory text into one string per entry.
 *
 * Line endings are normalised to "\n" first. The text is then cut at every
 * "\nPath\nClose\n" marker; each returned entry is the segment before a marker
 * (which ends with that entry's name lines), the marker itself, and the
 * segment after it. Text with no marker yields an empty array, so the header
 * on its own never becomes an entry.
 *
 * @param raw Full file contents.
 * @returns One string per marker found, in file order.
 */
export function splitEntries(raw: string): string[] {
  const MARKER = '\nPath\nClose\n';
  const segments = raw.replace(/\r\n?/g, '\n').split(MARKER);
  // Entry k is stitched from segment k (names) and segment k+1 (body).
  return segments.slice(1).map((body, idx) => segments[idx] + MARKER + body);
}

/**
 * Pull the location name and (optional) parish name out of the text that
 * precedes an entry's Path/Close marker.
 *
 * Lines are right-trimmed; artifact lines (see ARTIFACT_LINES), blank lines and
 * lines that begin with a space are ignored. Of what remains, the last line is
 * the location name and the one before it, if any, is the parish name.
 *
 * @param preMarker Text before the marker.
 * @returns locationName ("Unknown" when no usable line exists) and parishName (or null).
 */
export function extractNames(preMarker: string): { locationName: string; parishName: string | null } {
  const candidates: string[] = [];
  for (const rawLine of preMarker.split('\n')) {
    const line = rawLine.trimEnd();
    if (line.startsWith(' ')) continue;
    if (ARTIFACT_LINES.has(line.trim().toLowerCase())) continue;
    if (line.trim().length === 0) continue;
    candidates.push(line);
  }

  const count = candidates.length;
  if (count === 0) {
    return { locationName: 'Unknown', parishName: null };
  }

  const location = candidates[count - 1].trim();
  const parish = count >= 2 ? candidates[count - 2].trim() : null;
  return { locationName: location, parishName: parish };
}

// ─── Task 3: Field extractor ──────────────────────────────────────────────────

/**
 * Extract address, phone, email from the entry body (text after Path/Close).
 *
 * Each field is a label line ("Address", "Phone", "Email", matched
 * case-insensitively) followed by its value on the next line(s). The value
 * runs until a blank line, the next recognised label line, or end of text;
 * multi-line values are joined with single spaces.
 *
 * Full-width parentheses （ ） are normalised to ASCII ( ) in address and phone.
 *
 * @param body Entry text after the Path/Close marker.
 * @returns The three fields; a field is null when its label is missing or its value is empty.
 */
export function extractFields(body: string): { address: string | null; phone: string | null; email: string | null } {
  const normalise = (s: string) => s.replace(/（/g, '(').replace(/）/g, ')').trim();

  // Anything that ends a field value. "Address" and "Phone" are included so a
  // value is not allowed to run into the next field when no blank line
  // separates them (e.g. "Address\n...\nPhone\n...").
  const fieldEnd = [
    '\\n\\n', '\\nAddress', '\\nPhone', '\\nFax', '\\nEmail', '\\nWebsite',
    '\\nChurch', '\\nParish', '\\nAssistant', '\\nDeacon', '\\nSister',
    '\\nChairperson', '\\nResident', '\\nRector', '\\nP\\.C', '\\nPastoral',
    '\\nMass Time', '$',
  ].join('|');

  function extractField(fieldName: string): string | null {
    // Lazy capture: stop at the earliest terminator after the label line.
    const regex = new RegExp(`\\b${fieldName}\\n([\\s\\S]*?)(?:${fieldEnd})`, 'i');
    const m = body.match(regex);
    if (!m) return null;
    const value = m[1].replace(/\n/g, ' ').trim();
    return value || null;
  }

  const address = extractField('Address');
  const rawPhone = extractField('Phone');
  const email = extractField('Email');

  return {
    address: address ? normalise(address) : null,
    phone: rawPhone ? normalise(rawPhone) : null,
    email,
  };
}

// ─── Task 4: Time normalizer ──────────────────────────────────────────────────

/**
 * Normalise a time string to "HH:MM" 24-hour format.
 * Accepts: "8:00am", "11:30 am", "7:00 a.m.", "12:00 noon", bare "noon", etc.
 * The first time found anywhere in the string is used.
 *
 * Out-of-range values are rejected rather than emitted as malformed strings:
 * minutes above 59 or hours above 23 return null. An hour of 13–23 is taken to
 * be already in 24-hour form and is kept as-is whatever the am/pm suffix says
 * (so "13:30pm" is "13:30", not "25:30").
 *
 * @param raw Text containing a time.
 * @returns "HH:MM", or null if no valid time found.
 */
export function normalizeTime(raw: string): string | null {
  const s = raw.trim().toLowerCase();
  const pad = (n: number) => String(n).padStart(2, '0');

  if (s.includes('noon')) {
    if (s === 'noon') return '12:00';
    const noon = s.match(/(\d{1,2}):(\d{2})\s*noon/);
    if (noon) {
      const noonHour = parseInt(noon[1], 10);
      const noonMin = parseInt(noon[2], 10);
      if (noonHour > 23 || noonMin > 59) return null;
      return `${pad(noonHour)}:${pad(noonMin)}`;
    }
    // "noon" present but not attached to a time: fall through to am/pm parsing.
  }

  const m = s.match(/(\d{1,2}):(\d{2})\s*(am|pm|a\.m\.|p\.m\.)/);
  if (!m) return null;

  let h = parseInt(m[1], 10);
  const min = parseInt(m[2], 10);
  const period = m[3].replace(/\./g, '');

  if (h > 23 || min > 59) return null;

  if (h <= 12) {
    if (period === 'am') {
      if (h === 12) h = 0; // 12:xx am is just after midnight
    } else if (h !== 12) {
      h += 12;             // 12:xx pm stays 12
    }
  }
  // h in 13..23: already a 24-hour value; the suffix is ignored.

  return `${pad(h)}:${pad(min)}`;
}

// ─── Task 5: Schedule line parser ────────────────────────────────────────────

// Leading condition on a schedule line, e.g. "1st Sunday of the month:".
// Group 1 is the condition text including its closing colon.
// The negative lookahead keeps the hour/minute colon of a time from being
// mistaken for that closing colon: without it, "First Sunday 9:00am, 11:00am"
// matched up to "9:", which dropped the 9:00 Mass and left "First Sunday 9"
// as the note.
const CONDITIONAL_PATTERN = /^([\w\s]+(?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|month)[^:]*:)(?!\d{2}\s*(?:am|pm|a\.m\.|p\.m\.|noon))\s*/i;

/**
 * Parse a single schedule text line into 0-N ParsedSchedule records.
 *
 * Steps, in order:
 *  1. The first language tag (Cantonese/English/Tagalog/Chinese, optionally in
 *     parentheses) is removed and sets the language; "Chinese" maps to
 *     "Cantonese", and lines without a tag default to "English".
 *  2. The words "Saturday" and "on" are stripped.
 *  3. A leading condition ending in a colon becomes the line-level note.
 *  4. The rest is split on commas; every token containing a time yields one
 *     record. Leftover text in a token becomes that record's note, otherwise
 *     the line-level note (if any) is used.
 *
 * @param line One line from a Sunday or Anticipated Sunday section.
 * @param dayOfWeek The fixed day for this section (0=Sun, 6=Sat for Anticipated).
 * @returns One record per time found; empty when the line has no time.
 */
export function parseScheduleLine(line: string, dayOfWeek: number): ParsedSchedule[] {
  let remainder = line.trim();
  let language = 'English';
  let sectionNotes: string | null = null;

  // Extract language tag (with or without closing paren)
  const langMatch = remainder.match(/\(?(Cantonese|English|Tagalog|Chinese)\)?/i);
  if (langMatch) {
    const raw = langMatch[1].toLowerCase();
    language = raw === 'cantonese' || raw === 'chinese' ? 'Cantonese'
      : raw === 'tagalog' ? 'Tagalog'
      : 'English';
    remainder = remainder.replace(langMatch[0], '').trim();
  }

  // Strip "Saturday" / "on Saturday" anchors (Anticipated Sunday section)
  remainder = remainder.replace(/\bSaturday\b/gi, '').replace(/\bon\b/gi, '').trim();

  // Extract conditional note prefix
  const condMatch = remainder.match(CONDITIONAL_PATTERN);
  if (condMatch) {
    sectionNotes = condMatch[1].replace(/:$/, '').trim();
    remainder = remainder.slice(condMatch[0].length);
  }

  // Split by comma into time tokens
  const tokens = remainder.split(',').map(t => t.trim()).filter(Boolean);
  const results: ParsedSchedule[] = [];

  for (const token of tokens) {
    const time = normalizeTime(token);
    if (!time) continue; // tokens without a recognisable time are dropped

    // Anything in the token that isn't the time or period is a note
    const noteText = token
      .replace(/\d{1,2}:\d{2}\s*(am|pm|a\.m\.|p\.m\.|noon)/i, '')
      .replace(/\s+/g, ' ')
      .trim() || null;

    results.push({
      dayOfWeek,
      time,
      language,
      // A token-specific note takes precedence over the line-level condition.
      notes: noteText || sectionNotes,
    });
  }

  return results;
}

// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────

// Day-name lookups (keys are lower-case, dots already removed); values use 0=Sun … 6=Sat.
// The day regexes in this file accept any word that starts with a three-letter
// abbreviation, so the common longer spellings ("tues", "weds", "thurs") are
// listed here too; otherwise they resolve to no day at all.
const DAY_ABBREV: Record<string, number> = {
  mon: 1, tue: 2, tues: 2, wed: 3, weds: 3, thur: 4, thurs: 4, thu: 4, fri: 5, sat: 6, sun: 0,
};
const DAY_FULL: Record<string, number> = {
  monday: 1, tuesday: 2, wednesday: 3, thursday: 4, friday: 5, saturday: 6, sunday: 0,
};

/**
 * Turn a day prefix such as "Mon to Fri", "Mon., Tue. & Thur." or "Saturday:"
 * into day-of-week numbers (0=Sun … 6=Sat).
 *
 * Dots and colons are ignored and matching is case-insensitive. An "X to Y"
 * range is expanded inclusively and wraps around the week ("Sat to Mon" gives
 * 6, 0, 1). Otherwise the text is read as a list separated by commas,
 * ampersands or whitespace; unknown words are dropped and duplicates removed.
 *
 * @param prefix Day text taken from the start of a schedule token.
 * @returns Day numbers in the order encountered; empty when nothing is recognised.
 */
function parseDays(prefix: string): number[] {
  const s = prefix.toLowerCase().replace(/\./g, '').replace(/:/g, '').trim();
  const lookup = (name: string): number | undefined => DAY_FULL[name] ?? DAY_ABBREV[name];

  // Range: "monday to friday" or "mon to sat"
  const rangeMatch = s.match(/(\w+)\s+to\s+(\w+)/);
  if (rangeMatch) {
    const fromDay = lookup(rangeMatch[1]);
    const toDay = lookup(rangeMatch[2]);
    if (fromDay !== undefined && toDay !== undefined) {
      const days: number[] = [];
      // Walk forward modulo 7 so ranges may cross the Sat→Sun boundary.
      for (let d = fromDay; d !== toDay; d = (d + 1) % 7) days.push(d);
      days.push(toDay);
      return days;
    }
    // Either end unrecognised: fall through and treat the text as a list.
  }

  // List: "mon, tue, thur" or "tue & sat"
  const days = s
    .split(/[,&\s]+/)
    .map(t => t.trim())
    .filter(Boolean)
    .map(lookup)
    .filter((d): d is number => d !== undefined);
  return [...new Set(days)];
}

// Matches a day-prefix at the start of a token (requires trailing space/colon).
// Group 1 is the day text: either abbreviations (any word starting with
// Mon/Tue/Wed/Thur/Thu/Fri/Sat/Sun, optional dot), optionally chained with
// "," or "&" and optionally followed by "to <word>", or a full day name
// optionally followed by "to <word>". Case-insensitive.
const DAY_PREFIX_RE = /^((?:(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*(?:[,&]\s*(?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?\s*)*(?:to\s+\w+\.?\s*)?)|(?:(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+to\s+\w+)?))[\s:]+/i;

// Matches a token that is ONLY a single day with no time — e.g. "Mon." "Tue." "Tuesday".
// Anchored at both ends, so nothing may follow the day except an optional dot.
const PURE_DAY_RE = /^((?:Mon|Tue|Wed|Thur|Thu|Fri|Sat|Sun)\w*\.?|(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))\.?$/i;

/**
 * Parse one line of a weekday-Mass section, where times may be preceded by
 * day names ("Mon to Fri 7:00am, 6:00pm, Sat 8:00am").
 *
 * The first language tag found is removed and sets the language (default
 * "English"; "Chinese" maps to "Cantonese"). The rest is split on commas and
 * each token is handled in turn:
 *  - a token carrying days but no time adds those days to a pending list;
 *  - a token carrying a time is emitted once per active day. Pending days
 *    (merged with the token's own days) become the active days; failing that
 *    the token's own days do; failing that the previous active days are kept.
 * Active days start as Monday–Friday. Notes are always null.
 *
 * @param line One line from the weekday section.
 * @returns One record per (day, time) pair.
 */
export function parseWeekdayLine(line: string): ParsedSchedule[] {
  let text = line.trim();
  let language = 'English';

  const tag = /\(?(Cantonese|English|Tagalog|Chinese)\)?/i.exec(text);
  if (tag !== null) {
    switch (tag[1].toLowerCase()) {
      case 'cantonese':
      case 'chinese':
        language = 'Cantonese';
        break;
      case 'tagalog':
        language = 'Tagalog';
        break;
      default:
        language = 'English';
    }
    // Also drop an opening parenthesis left dangling at the end of the line.
    text = text.replace(tag[0], '').replace(/\s*\(\s*$/, '').trim();
  }

  const schedules: ParsedSchedule[] = [];
  let activeDays: number[] = [1, 2, 3, 4, 5]; // Mon–Fri until the line says otherwise
  let pendingDays: number[] = [];             // days seen without a time yet

  for (const piece of text.split(',')) {
    const token = piece.trim();
    if (!token) continue;

    // Work out which days the token names and which part may hold a time.
    let tokenDays: number[] = [];
    let timeText: string | null = token;
    const prefix = DAY_PREFIX_RE.exec(token);
    if (prefix !== null) {
      tokenDays = parseDays(prefix[1]);
      timeText = token.slice(prefix[0].length);
    } else if (PURE_DAY_RE.test(token)) {
      tokenDays = parseDays(token);
      timeText = null; // a bare day never carries a time
    }

    const time = timeText === null ? null : normalizeTime(timeText);
    if (!time) {
      pendingDays.push(...tokenDays);
      continue;
    }

    if (pendingDays.length > 0) {
      activeDays = [...new Set([...pendingDays, ...tokenDays])];
      pendingDays = [];
    } else if (tokenDays.length > 0) {
      activeDays = tokenDays;
    }

    for (const day of activeDays) {
      schedules.push({ dayOfWeek: day, time, language, notes: null });
    }
  }

  return schedules;
}

// ─── Task 7: Full entry parser ────────────────────────────────────────────────

// Section headings (lower-case) whose lines are ignored entirely.
const SKIP_SECTIONS = new Set(['special masses', 'eucharist adoration']);

/**
 * Turn one raw entry string (name lines, Path/Close marker, body) into a ParsedEntry.
 *
 * Names come from the text before the first marker and contact fields from the
 * text after it; when no marker is present the whole string is treated as body.
 * Schedules are read from the "Mass Time" section, which ends at the next
 * "Share" line or at end of text. Within it, the headings "Sunday Masses",
 * "Anticipated Sunday Masses" and "Weekday Masses" (case-insensitive, whole
 * line) select how following lines are parsed; lines before any heading, and
 * lines under a skipped heading, are ignored.
 *
 * @param raw One entry as produced by splitEntries.
 * @returns The parsed entry; schedules is empty when no Mass Time section is found.
 */
export function parseEntry(raw: string): ParsedEntry {
  const MARKER = '\nPath\nClose\n';
  const cut = raw.indexOf(MARKER);
  const hasMarker = cut >= 0;
  const pre = hasMarker ? raw.slice(0, cut) : '';
  const body = hasMarker ? raw.slice(cut + MARKER.length) : raw;

  const names = extractNames(pre);
  const fields = extractFields(body);

  const headings = new Map<string, string>([
    ['sunday masses', 'sunday'],
    ['anticipated sunday masses', 'anticipated'],
    ['weekday masses', 'weekday'],
  ]);

  const schedules: ParsedSchedule[] = [];
  const massBlock = /Mass Time\n([\s\S]*?)(?:Share\n|$)/i.exec(body);
  let section: string | null = null;

  for (const rawLine of massBlock ? massBlock[1].split('\n') : []) {
    const text = rawLine.trim();
    if (text === '') continue;

    const key = text.toLowerCase();
    const heading = headings.get(key) ?? (SKIP_SECTIONS.has(key) ? 'skip' : undefined);
    if (heading !== undefined) {
      section = heading;
      continue;
    }

    switch (section) {
      case 'sunday':
        schedules.push(...parseScheduleLine(text, 0));
        break;
      case 'anticipated':
        // Anticipated Sunday Masses are recorded against Saturday.
        schedules.push(...parseScheduleLine(text, 6));
        break;
      case 'weekday':
        schedules.push(...parseWeekdayLine(text));
        break;
      default:
        // No heading seen yet, or inside a skipped section.
        break;
    }
  }

  return {
    locationName: names.locationName,
    parishName: names.parishName,
    address: fields.address,
    phone: fields.phone,
    email: fields.email,
    schedules,
  };
}

// ─── Task 8: Name normalizer + matcher ───────────────────────────────────────

// Words that carry no distinguishing information when comparing church names.
const NOISE_WORDS = new Set([
  'church', 'parish', 'chapel', 'centre', 'center', 'mass',
  'saint', 'st', 'our', 'lady', 'of', 'the', 'a', 'an',
]);

/**
 * Reduce a church name to a canonical form for comparison.
 *
 * The name is lower-cased, combining accents are removed, every character
 * other than a–z, 0–9 and whitespace becomes a space, and the result is split
 * into words. Single-character words and noise words are discarded; the
 * survivors are joined with single spaces.
 *
 * @param name Church name as written.
 * @returns The canonical form; may be empty if only noise words were present.
 */
export function normalizeName(name: string): string {
  const unaccented = name
    .toLowerCase()
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '');

  const kept: string[] = [];
  for (const word of unaccented.replace(/[^a-z0-9\s]/g, ' ').split(/\s+/)) {
    if (word.length < 2) continue;
    if (NOISE_WORDS.has(word)) continue;
    kept.push(word);
  }
  return kept.join(' ').trim();
}

/**
 * Similarity of two space-separated word lists: the number of distinct words
 * they share divided by the size of the larger distinct-word set.
 *
 * @param a First word list.
 * @param b Second word list.
 * @returns A value from 0 to 1; 0 when either side has no words.
 */
function wordOverlap(a: string, b: string): number {
  const wordsA = new Set(a.split(' ').filter(w => w !== ''));
  const wordsB = new Set(b.split(' ').filter(w => w !== ''));
  if (wordsA.size === 0 || wordsB.size === 0) {
    return 0;
  }
  const shared = [...wordsA].filter(w => wordsB.has(w)).length;
  return shared / Math.max(wordsA.size, wordsB.size);
}

/**
 * Pick the existing church that best corresponds to a parsed entry.
 *
 * Names are compared first: both sides are normalised and scored by word
 * overlap, and the highest-scoring church wins if its score is at least 0.4
 * (the earliest church wins ties). If no name is close enough and the entry
 * has an address of five or more characters, the first church whose address
 * contains the entry address's first 12 characters (case-insensitive) is
 * returned instead.
 *
 * @param locationName Name of the parsed location.
 * @param address Parsed address, or null.
 * @param existing Candidate churches.
 * @returns The matched church, or null if nothing meets either criterion.
 */
export function findMatch(
  locationName: string,
  address: string | null,
  existing: ExistingChurch[]
): ExistingChurch | null {
  const wanted = normalizeName(locationName);

  const top = existing.reduce<{ church: ExistingChurch | null; score: number }>(
    (acc, church) => {
      const score = wordOverlap(wanted, normalizeName(church.name));
      // Strictly greater, so the first of several equal scores is kept.
      return score > acc.score ? { church, score } : acc;
    },
    { church: null, score: 0 },
  );
  if (top.score >= 0.4) {
    return top.church;
  }

  // Fallback: address prefix match (first 12 chars)
  if (!address || address.length < 5) {
    return null;
  }
  const needle = address.slice(0, 12).toLowerCase();
  return existing.find(church => church.address?.toLowerCase().includes(needle)) ?? null;
}