// ScraperControl/scripts/import-hk-parishes.test.ts

import { test } from 'node:test';
import assert from 'node:assert/strict';
import {
  splitEntries,
  extractNames,
  extractFields,
  normalizeTime,
  parseScheduleLine,
  parseWeekdayLine,
  parseEntry,
  normalizeName,
  findMatch,
} from './import-hk-parishes.js';

// ─── Task 2: Entry splitter and name extractor ────────────────────────────────

// splitEntries: a listing with a page heading followed by two records (each
// carrying a Path/Close pair) must yield two chunks, one per church.
test('splitEntries splits on Path/Close boundary', () => {
  const page = [
    'HONG KONG CHURCHES',
    '',
    'Parish A',
    'Church A',
    'Path',
    'Close',
    'Address',
    '1 Main St',
    '',
    'Parish B',
    'Church B',
    'Path',
    'Close',
    'Address',
    '2 Side St',
    '',
  ].join('\n');

  const chunks = splitEntries(page);

  assert.strictEqual(chunks.length, 2);
  const [first, second] = chunks;
  assert.ok(first.includes('Church A'));
  assert.ok(second.includes('Church B'));
});

// extractNames: with two lines, the first is reported as the parish and the
// second as the location.
test('extractNames returns locationName and parishName', () => {
  const { locationName, parishName } = extractNames('Holy Cross Parish\nHOLY CROSS CHURCH');

  assert.strictEqual(locationName, 'HOLY CROSS CHURCH');
  assert.strictEqual(parishName, 'Holy Cross Parish');
});

// extractNames: a leading "Share" line and a space-prefixed duplicate of the
// location name must not end up in either returned name.
test('extractNames strips Share and leading-space artifacts', () => {
  const preamble = [
    'Share',
    ' Carmelite Monastery',
    "St. Anne's Parish",
    'Carmelite Monastery',
  ].join('\n');

  const { locationName, parishName } = extractNames(preamble);

  assert.strictEqual(locationName, 'Carmelite Monastery');
  assert.strictEqual(parishName, "St. Anne's Parish");
});

// extractNames: a lone line is treated as the location name, with no parish.
test('extractNames handles single name line', () => {
  const { locationName, parishName } = extractNames('Cathedral Parish');

  assert.strictEqual(locationName, 'Cathedral Parish');
  assert.strictEqual(parishName, null);
});

// ─── Task 3: Field extractor ──────────────────────────────────────────────────

// extractFields: a body with Address/Phone/Fax/Email/Website labels yields the
// address and email verbatim, and the phone with ASCII parentheses in place of
// the full-width ones present in the input.
test('extractFields parses address, phone, email', () => {
  const body = [
    'Address',
    '1 Holy Cross Path, Shau Kei Wan, Hong Kong',
    '',
    'Phone',
    '（852）2560-1823',
    '',
    'Fax',
    '（852）2535-8246',
    '',
    'Email',
    'holycrosshk@gmail.com',
    '',
    'Website',
    'Click Here',
    '',
    'Mass Time',
    '',
  ].join('\n');

  const { address, phone, email } = extractFields(body);

  assert.strictEqual(address, '1 Holy Cross Path, Shau Kei Wan, Hong Kong');
  assert.strictEqual(phone, '(852)2560-1823');
  assert.strictEqual(email, 'holycrosshk@gmail.com');
});

// extractFields: when only an Address label is present, phone and email are
// reported as null rather than missing or empty.
test('extractFields handles missing fields gracefully', () => {
  const body = [
    'Address',
    'Salesian School, 16 Chai Wan Road, Hong Kong.',
    '',
    'Mass Time',
    '',
  ].join('\n');

  const { address, phone, email } = extractFields(body);

  assert.strictEqual(address, 'Salesian School, 16 Chai Wan Road, Hong Kong.');
  assert.strictEqual(phone, null);
  assert.strictEqual(email, null);
});

// extractFields: the full-width parentheses around the area code come back as
// ASCII parentheses; the rest of the phone line (including the second number)
// is kept as-is.
test('extractFields strips full-width parens from phone', () => {
  const { phone } = extractFields('Phone\n（852）2819-5777, 2819-5845\n\n');

  assert.strictEqual(phone, '(852)2819-5777, 2819-5845');
});

// ─── Task 4: Time normalizer ──────────────────────────────────────────────────

// normalizeTime: "am"/"pm" suffixes, with or without a separating space, map
// to zero-padded 24-hour HH:MM strings.
test('normalizeTime handles am/pm with spaces', () => {
  const cases = [
    ['8:00am', '08:00'],
    ['11:30 am', '11:30'],
    ['6:00pm', '18:00'],
    ['6:30 pm', '18:30'],
  ] as const;

  for (const [input, expected] of cases) {
    assert.strictEqual(normalizeTime(input), expected);
  }
});

// normalizeTime: the dotted "a.m." / "p.m." spellings are accepted too.
test('normalizeTime handles a.m./p.m. format', () => {
  const cases = [
    ['7:00 a.m.', '07:00'],
    ['7:45 a.m.', '07:45'],
    ['6:00 p.m.', '18:00'],
  ] as const;

  for (const [input, expected] of cases) {
    assert.strictEqual(normalizeTime(input), expected);
  }
});

// normalizeTime: "noon" and a 12-something "pm" both stay in the 12:xx hour.
test('normalizeTime handles noon', () => {
  const cases = [
    ['12:00 noon', '12:00'],
    ['12:30 pm', '12:30'],
  ] as const;

  for (const [input, expected] of cases) {
    assert.strictEqual(normalizeTime(input), expected);
  }
});

// normalizeTime: 12:00am is the start of the day, i.e. 00:00.
test('normalizeTime handles 12:00am as midnight', () => {
  const midnight = normalizeTime('12:00am');

  assert.strictEqual(midnight, '00:00');
});

// normalizeTime: input with no time in it (a day name, the empty string)
// produces null.
test('normalizeTime returns null for unrecognised input', () => {
  for (const notATime of ['Monday', '']) {
    assert.strictEqual(normalizeTime(notATime), null);
  }
});

// ─── Task 5: Schedule line parser ────────────────────────────────────────────

// parseScheduleLine: one time plus a parenthesised language gives exactly one
// schedule record with the supplied day, a 24-hour time and null notes.
test('parseScheduleLine parses single time with language', () => {
  const expected = { dayOfWeek: 0, time: '09:30', language: 'English', notes: null };

  const parsed = parseScheduleLine('9:30am (English)', 0);

  assert.strictEqual(parsed.length, 1);
  assert.deepStrictEqual(parsed[0], expected);
});

// parseScheduleLine: comma-separated times each become their own record; the
// trailing language is checked on the last one.
test('parseScheduleLine parses multiple comma-separated times', () => {
  const parsed = parseScheduleLine('8:00am,10:30 am (Cantonese)', 0);

  assert.strictEqual(parsed.length, 2);
  const [early, late] = parsed;
  assert.strictEqual(early.time, '08:00');
  assert.strictEqual(late.time, '10:30');
  assert.strictEqual(late.language, 'Cantonese');
});

// parseScheduleLine: an unterminated "(Cantonese" is still read as the language.
test('parseScheduleLine handles missing closing paren', () => {
  const [first] = parseScheduleLine('9:30 am (Cantonese', 0);

  assert.strictEqual(first.language, 'Cantonese');
});

// parseScheduleLine: with no parenthesised language, the record is English.
test('parseScheduleLine defaults language to English when not specified', () => {
  const [first] = parseScheduleLine('8:00am', 0);

  assert.strictEqual(first.language, 'English');
});

// parseScheduleLine: free text following a time is kept as that record's notes.
test('parseScheduleLine stores embedded note text', () => {
  const parsed = parseScheduleLine('9:00 am Sunday School & Family Mass,11:30am (English)', 0);

  assert.strictEqual(parsed.length, 2);
  const { time, notes } = parsed[0];
  assert.strictEqual(time, '09:00');
  assert.strictEqual(notes, 'Sunday School & Family Mass');
});

// parseScheduleLine: a "Saturday " prefix in front of each time does not stop
// the times from being parsed.
test('parseScheduleLine handles Saturday anticipated format variations', () => {
  const expectedTimes = ['15:45', '18:30'];

  const parsed = parseScheduleLine('Saturday 3:45 pm,Saturday 6:30 pm (Cantonese)', 6);

  assert.strictEqual(parsed.length, expectedTimes.length);
  for (let i = 0; i < expectedTimes.length; i += 1) {
    assert.strictEqual(parsed[i].time, expectedTimes[i]);
  }
});

// parseScheduleLine: an "on Saturday" suffix between the time and the language
// leaves both the time and the language intact.
test('parseScheduleLine handles "on Saturday" suffix format', () => {
  const parsed = parseScheduleLine('6:00pm on Saturday (Cantonese)', 6);

  assert.strictEqual(parsed.length, 1);
  const { time, language } = parsed[0];
  assert.strictEqual(time, '18:00');
  assert.strictEqual(language, 'Cantonese');
});

// parseScheduleLine: a condition placed before the time ("5th Sunday of the
// month:") must survive somewhere in the record's notes.
test('parseScheduleLine handles conditional prefix as notes', () => {
  const parsed = parseScheduleLine('5th Sunday of the month: 7:15 am (Tagalog)', 0);

  assert.strictEqual(parsed.length, 1);
  const [mass] = parsed;
  assert.strictEqual(mass.time, '07:15');
  assert.strictEqual(mass.language, 'Tagalog');
  assert.ok(mass.notes?.includes('5th Sunday'));
});

// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────

// parseWeekdayLine: a line with no day prefix expands to one record per day
// 1..5 (in that order), all sharing the same time and language.
test('parseWeekdayLine no prefix = all weekdays Mon-Fri', () => {
  const slots = parseWeekdayLine('7:15 am (Cantonese)');

  assert.strictEqual(slots.length, 5);

  const days: number[] = [];
  for (const slot of slots) {
    assert.strictEqual(slot.time, '07:15');
    assert.strictEqual(slot.language, 'Cantonese');
    days.push(slot.dayOfWeek);
  }
  assert.deepStrictEqual(days, [1, 2, 3, 4, 5]);
});

// parseWeekdayLine: a comma-separated list of day abbreviations yields one
// record per listed day. The order of the records is not asserted, only the set.
test('parseWeekdayLine abbreviation list', () => {
  const results = parseWeekdayLine('Mon., Tue., Thur. 8:00 a.m. (Cantonese)');
  assert.equal(results.length, 3);
  // Explicit numeric comparator: a bare sort() orders elements by their string
  // form, which is only accidentally correct for single-digit day numbers.
  const days = results.map(r => r.dayOfWeek).sort((a: number, b: number) => a - b);
  assert.deepEqual(days, [1, 2, 4]);
});

// parseWeekdayLine: "Mon. to Sat." is an inclusive range and expands to days
// 1 through 6. The order of the records is not asserted, only the set.
test('parseWeekdayLine abbreviation range Mon. to Sat.', () => {
  const results = parseWeekdayLine('Mon. to Sat. 9:15 am (English)');
  assert.equal(results.length, 6);
  // Explicit numeric comparator: a bare sort() orders elements by their string
  // form, which is only accidentally correct for single-digit day numbers.
  const days = results.map(r => r.dayOfWeek).sort((a: number, b: number) => a - b);
  assert.deepEqual(days, [1, 2, 3, 4, 5, 6]);
});

// parseWeekdayLine: a spelled-out "Monday to Friday:" range yields five
// records, each at 12:00.
test('parseWeekdayLine full-word range Monday to Friday', () => {
  const slots = parseWeekdayLine('Monday to Friday: 12:00 noon (English)');

  assert.strictEqual(slots.length, 5);
  for (const slot of slots) {
    assert.strictEqual(slot.time, '12:00');
  }
});

// parseWeekdayLine: "&" is accepted as a list separator alongside commas.
// The order of the records is not asserted, only the set.
test('parseWeekdayLine ampersand separator', () => {
  const results = parseWeekdayLine('Tue., Thur. & Sat. 9:45 a.m. (Cantonese)');
  assert.equal(results.length, 3);
  // Explicit numeric comparator: a bare sort() orders elements by their string
  // form, which is only accidentally correct for single-digit day numbers.
  const days = results.map(r => r.dayOfWeek).sort((a: number, b: number) => a - b);
  assert.deepEqual(days, [2, 4, 6]);
});

// parseWeekdayLine: several "<day range>: <time>" groups on one comma-separated
// line are all expanded. The expected 16 is consistent with 6 + 5 + 5 records
// (one Mon-Sat group and two Mon-Fri groups).
test('parseWeekdayLine multiple time groups on one line', () => {
  const line = [
    'Monday to Saturday: 7:45 am',
    'Monday to Friday: 12:00 noon',
    'Monday to Friday: 6:00 pm (English)',
  ].join(',');

  const slots = parseWeekdayLine(line);

  assert.strictEqual(slots.length, 16);
});

// ─── Task 7: Full entry parser ────────────────────────────────────────────────

// parseEntry: end-to-end check on one complete record. Verifies the names, the
// contact fields, and how many schedule records land on Sunday (0), Saturday
// (6) and Monday-Friday (1..5). The "Special Masses" section is part of the
// fixture; the counts leave no room for it to add Sunday, Saturday or weekday
// records.
test('parseEntry extracts names, fields, and schedules from a full entry', () => {
  const raw = [
    'Holy Cross Parish',
    'HOLY CROSS CHURCH',
    'Path',
    'Close',
    'Address',
    '1 Holy Cross Path, Shau Kei Wan, Hong Kong',
    '',
    'Phone',
    '（852）2560-1823',
    '',
    'Email',
    'holycrosshk@gmail.com',
    '',
    'Website',
    'Click Here',
    '',
    'Mass Time',
    'Sunday Masses',
    '8:00am,9:30am (Cantonese)',
    '1:00 pm (English)',
    '',
    'Anticipated Sunday Masses',
    'Saturday 3:45 pm,Saturday 6:30 pm (Cantonese)',
    '',
    'Weekday Masses',
    '7:15 am (Cantonese)',
    '',
    'Special Masses',
    'Something irrelevant',
    '',
  ].join('\n');

  const entry = parseEntry(raw);

  assert.strictEqual(entry.locationName, 'HOLY CROSS CHURCH');
  assert.strictEqual(entry.parishName, 'Holy Cross Parish');
  assert.strictEqual(entry.address, '1 Holy Cross Path, Shau Kei Wan, Hong Kong');
  assert.strictEqual(entry.phone, '(852)2560-1823');
  assert.strictEqual(entry.email, 'holycrosshk@gmail.com');

  // Tally schedule records per bucket in a single pass.
  let sundayCount = 0;
  let saturdayCount = 0;
  let weekdayCount = 0;
  for (const { dayOfWeek } of entry.schedules) {
    if (dayOfWeek === 0) {
      sundayCount += 1;
    } else if (dayOfWeek === 6) {
      saturdayCount += 1;
    } else if (dayOfWeek >= 1 && dayOfWeek <= 5) {
      weekdayCount += 1;
    }
  }

  // Sunday: two Cantonese times plus one English time.
  assert.strictEqual(sundayCount, 3);
  // Anticipated Masses are filed under Saturday: two times.
  assert.strictEqual(saturdayCount, 2);
  // One unprefixed weekday time, expanded to Monday-Friday.
  assert.strictEqual(weekdayCount, 5);
});

// ─── Task 8: Name normalizer + matcher ───────────────────────────────────────

// normalizeName: each input is reduced to a lowercase core, dropping words
// such as "Church", "Parish", "Mass Centre", "Our Lady Of" and "St.", along
// with the possessive "'s".
test('normalizeName strips noise words and lowercases', () => {
  const cases = [
    ['HOLY CROSS CHURCH', 'holy cross'],
    ['Our Lady Of Mount Carmel Church', 'mount carmel'],
    ["St. Joseph's Parish", 'joseph'],
    ['Salesian Mass Centre', 'salesian'],
  ] as const;

  for (const [rawName, expected] of cases) {
    assert.strictEqual(normalizeName(rawName), expected);
  }
});

// findMatch: each imported name is resolved to the expected existing record
// out of two candidates.
test('findMatch matches by name overlap', () => {
  const holyCross = { id: '1', name: 'Holy Cross (Sai Wan Ho)', address: '1 Holy Cross Path', phone: null, email: null };
  const stJoseph = { id: '2', name: 'St Joseph (Central)', address: '37 Garden Road', phone: null, email: null };
  const existing = [holyCross, stJoseph];

  const holyCrossHit = findMatch('HOLY CROSS CHURCH', '1 Holy Cross Path', existing);
  assert.strictEqual(holyCrossHit?.id, '1');

  const stJosephHit = findMatch("St. Joseph's Church", '37 Garden Road', existing);
  assert.strictEqual(stJosephHit?.id, '2');
});

// findMatch: the stored address is a prefix of the imported one, and the
// stored name is bilingual; the record must still be found.
test('findMatch falls back to address prefix match', () => {
  const mountCarmel = {
    id: '3',
    name: '聖母聖衣堂 (Our Lady of Mount Carmel Wanchai)',
    address: 'No.1, Star Street',
    phone: null,
    email: null,
  };

  const hit = findMatch('Our Lady Of Mount Carmel Church', 'No.1, Star Street, Wan Chai', [mountCarmel]);

  assert.strictEqual(hit?.id, '3');
});

// findMatch: when neither name nor address lines up with the only candidate,
// the result is exactly null.
test('findMatch returns null for no match', () => {
  const holyCross = { id: '1', name: 'Holy Cross (Sai Wan Ho)', address: '1 Holy Cross Path', phone: null, email: null };

  const hit = findMatch('Salesian Mass Centre', 'Salesian School, 16 Chai Wan Road', [holyCross]);

  assert.strictEqual(hit, null);
});