// Unit tests for the pure parsing helpers exported by import-hk-parishes.
// Each section mirrors one parser stage (entry splitting, field extraction,
// time normalisation, schedule-line parsing, weekday-prefix parsing).
import { test } from 'node:test';
import assert from 'node:assert/strict';
import {
  splitEntries,
  extractNames,
  extractFields,
  normalizeTime,
  parseScheduleLine,
  parseWeekdayLine,
} from './import-hk-parishes.js';

/** Numeric comparator; the default Array#sort compares elements as strings. */
const byNumber = (a: number, b: number): number => a - b;

// ─── Task 2: Entry splitter and name extractor ────────────────────────────────

test('splitEntries splits on Path/Close boundary', () => {
  const raw = `HONG KONG CHURCHES\n\nParish A\nChurch A\nPath\nClose\nAddress\n1 Main St\n\nParish B\nChurch B\nPath\nClose\nAddress\n2 Side St\n`;
  const entries = splitEntries(raw);
  assert.equal(entries.length, 2);
  assert.ok(entries[0].includes('Church A'));
  assert.ok(entries[1].includes('Church B'));
});

test('extractNames returns locationName and parishName', () => {
  const pre = `Holy Cross Parish\nHOLY CROSS CHURCH`;
  const result = extractNames(pre);
  assert.equal(result.locationName, 'HOLY CROSS CHURCH');
  assert.equal(result.parishName, 'Holy Cross Parish');
});

test('extractNames strips Share and leading-space artifacts', () => {
  const pre = `Share\n Carmelite Monastery\nSt. Anne's Parish\nCarmelite Monastery`;
  const result = extractNames(pre);
  assert.equal(result.locationName, 'Carmelite Monastery');
  assert.equal(result.parishName, "St. Anne's Parish");
});

test('extractNames handles single name line', () => {
  const pre = `Cathedral Parish`;
  const result = extractNames(pre);
  assert.equal(result.locationName, 'Cathedral Parish');
  assert.equal(result.parishName, null);
});

// ─── Task 3: Field extractor ──────────────────────────────────────────────────

test('extractFields parses address, phone, email', () => {
  const body = `Address\n1 Holy Cross Path, Shau Kei Wan, Hong Kong\n\nPhone\n(852)2560-1823\n\nFax\n(852)2535-8246\n\nEmail\nholycrosshk@gmail.com\n\nWebsite\nClick Here\n\nMass Time\n`;
  const f = extractFields(body);
  assert.equal(f.address, '1 Holy Cross Path, Shau Kei Wan, Hong Kong');
  assert.equal(f.phone, '(852)2560-1823');
  assert.equal(f.email, 'holycrosshk@gmail.com');
});

test('extractFields handles missing fields gracefully', () => {
  const body = `Address\nSalesian School, 16 Chai Wan Road, Hong Kong.\n\nMass Time\n`;
  const f = extractFields(body);
  assert.equal(f.address, 'Salesian School, 16 Chai Wan Road, Hong Kong.');
  assert.equal(f.phone, null);
  assert.equal(f.email, null);
});

test('extractFields strips full-width parens from phone', () => {
  // The input uses U+FF08 / U+FF09 (written as escapes so the test survives
  // any tooling that folds full-width characters to ASCII); the expected
  // value uses plain ASCII parentheses.
  const body = `Phone\n\uFF08852\uFF092819-5777, 2819-5845\n\n`;
  const f = extractFields(body);
  assert.equal(f.phone, '(852)2819-5777, 2819-5845');
});

// ─── Task 4: Time normalizer ──────────────────────────────────────────────────

test('normalizeTime handles am/pm with spaces', () => {
  assert.equal(normalizeTime('8:00am'), '08:00');
  assert.equal(normalizeTime('11:30 am'), '11:30');
  assert.equal(normalizeTime('6:00pm'), '18:00');
  assert.equal(normalizeTime('6:30 pm'), '18:30');
});

test('normalizeTime handles a.m./p.m. format', () => {
  assert.equal(normalizeTime('7:00 a.m.'), '07:00');
  assert.equal(normalizeTime('7:45 a.m.'), '07:45');
  assert.equal(normalizeTime('6:00 p.m.'), '18:00');
});

test('normalizeTime handles noon', () => {
  assert.equal(normalizeTime('12:00 noon'), '12:00');
  assert.equal(normalizeTime('12:30 pm'), '12:30');
});

test('normalizeTime handles 12:00am as midnight', () => {
  assert.equal(normalizeTime('12:00am'), '00:00');
});

test('normalizeTime returns null for unrecognised input', () => {
  assert.equal(normalizeTime('Monday'), null);
  assert.equal(normalizeTime(''), null);
});

// ─── Task 5: Schedule line parser ────────────────────────────────────────────

test('parseScheduleLine parses single time with language', () => {
  const results = parseScheduleLine('9:30am (English)', 0);
  assert.equal(results.length, 1);
  assert.deepEqual(results[0], { dayOfWeek: 0, time: '09:30', language: 'English', notes: null });
});

test('parseScheduleLine parses multiple comma-separated times', () => {
  const results = parseScheduleLine('8:00am,10:30 am (Cantonese)', 0);
  assert.equal(results.length, 2);
  assert.equal(results[0].time, '08:00');
  assert.equal(results[1].time, '10:30');
  assert.equal(results[1].language, 'Cantonese');
});

test('parseScheduleLine handles missing closing paren', () => {
  const results = parseScheduleLine('9:30 am (Cantonese', 0);
  assert.equal(results[0].language, 'Cantonese');
});

test('parseScheduleLine defaults language to English when not specified', () => {
  const results = parseScheduleLine('8:00am', 0);
  assert.equal(results[0].language, 'English');
});

test('parseScheduleLine stores embedded note text', () => {
  const results = parseScheduleLine('9:00 am Sunday School & Family Mass,11:30am (English)', 0);
  assert.equal(results.length, 2);
  assert.equal(results[0].time, '09:00');
  assert.equal(results[0].notes, 'Sunday School & Family Mass');
});

test('parseScheduleLine handles Saturday anticipated format variations', () => {
  const results = parseScheduleLine('Saturday 3:45 pm,Saturday 6:30 pm (Cantonese)', 6);
  assert.equal(results.length, 2);
  assert.equal(results[0].time, '15:45');
  assert.equal(results[1].time, '18:30');
});

test('parseScheduleLine handles "on Saturday" suffix format', () => {
  const results = parseScheduleLine('6:00pm on Saturday (Cantonese)', 6);
  assert.equal(results.length, 1);
  assert.equal(results[0].time, '18:00');
  assert.equal(results[0].language, 'Cantonese');
});

test('parseScheduleLine handles conditional prefix as notes', () => {
  const results = parseScheduleLine('5th Sunday of the month: 7:15 am (Tagalog)', 0);
  assert.equal(results.length, 1);
  assert.equal(results[0].time, '07:15');
  assert.equal(results[0].language, 'Tagalog');
  assert.ok(results[0].notes?.includes('5th Sunday'));
});

// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────

test('parseWeekdayLine no prefix = all weekdays Mon-Fri', () => {
  const results = parseWeekdayLine('7:15 am (Cantonese)');
  assert.equal(results.length, 5);
  assert.ok(results.every(r => r.time === '07:15'));
  assert.ok(results.every(r => r.language === 'Cantonese'));
  assert.deepEqual(results.map(r => r.dayOfWeek), [1, 2, 3, 4, 5]);
});

test('parseWeekdayLine abbreviation list', () => {
  const results = parseWeekdayLine('Mon., Tue., Thur. 8:00 a.m. (Cantonese)');
  assert.equal(results.length, 3);
  assert.deepEqual(results.map(r => r.dayOfWeek).sort(byNumber), [1, 2, 4]);
});

test('parseWeekdayLine abbreviation range Mon. to Sat.', () => {
  const results = parseWeekdayLine('Mon. to Sat. 9:15 am (English)');
  assert.equal(results.length, 6);
  assert.deepEqual(results.map(r => r.dayOfWeek).sort(byNumber), [1, 2, 3, 4, 5, 6]);
});

test('parseWeekdayLine full-word range Monday to Friday', () => {
  const results = parseWeekdayLine('Monday to Friday: 12:00 noon (English)');
  assert.equal(results.length, 5);
  assert.ok(results.every(r => r.time === '12:00'));
});

test('parseWeekdayLine ampersand separator', () => {
  const results = parseWeekdayLine('Tue., Thur. & Sat. 9:45 a.m. (Cantonese)');
  assert.equal(results.length, 3);
  assert.deepEqual(results.map(r => r.dayOfWeek).sort(byNumber), [2, 4, 6]);
});

test('parseWeekdayLine multiple time groups on one line', () => {
  // 6 days (Mon–Sat) + 5 days (Mon–Fri) + 5 days (Mon–Fri) = 16 records.
  const results = parseWeekdayLine('Monday to Saturday: 7:45 am,Monday to Friday: 12:00 noon,Monday to Friday: 6:00 pm (English)');
  assert.equal(results.length, 16);
});
/** Canonical marker placed between an entry's name lines and its body. */
const ENTRY_MARKER = '\nPath\nClose\n';

/**
 * Matches the "Path" / "Close" line pair, tolerating stray spaces or tabs
 * around either word so that whitespace noise does not merge two entries.
 */
const ENTRY_MARKER_RE = /\n[ \t]*Path[ \t]*\n[ \t]*Close[ \t]*\n/;

/** Lines (after trimming) that are page artifacts rather than names. */
const NAME_ARTIFACT_RE = /^(?:share|path|close)$/i;

/**
 * Split raw file text into individual entry strings.
 *
 * Line endings are normalised to "\n" first. The text is then cut at every
 * "Path\nClose" marker; entry N is the segment before marker N, the canonical
 * marker, and the segment after it.
 *
 * Note that the segment before a marker is kept in full, so the first entry
 * still starts with any file header and later entries start with the previous
 * entry's body. The name lines are the last lines before the marker.
 *
 * @param raw - Full text of the source file.
 * @returns One string per marker found; an empty array when there is none.
 */
export function splitEntries(raw: string): string[] {
  const text = raw.replace(/\r\n?/g, '\n');
  const segments = text.split(ENTRY_MARKER_RE);
  // segments[i] precedes marker i and segments[i + 1] follows it.
  return segments.slice(1).map((body, i) => segments[i] + ENTRY_MARKER + body);
}

/**
 * Extract location name and parish name from the pre-marker text of an entry.
 *
 * Blank lines, the artifact lines "Share" / "Path" / "Close" (any case) and
 * lines that begin with a space are ignored. Of the remaining lines only the
 * last two are used: the final line is the location, the one before it the
 * parish.
 *
 * @param preMarker - Text that precedes the "Path\nClose" marker.
 * @returns `locationName` is 'Unknown' when no usable line exists;
 *          `parishName` is null when only one usable line exists.
 */
export function extractNames(preMarker: string): { locationName: string; parishName: string | null } {
  const names: string[] = [];
  for (const rawLine of preMarker.split('\n')) {
    // A leading space marks a duplicated, indented copy of the name.
    if (rawLine.startsWith(' ')) continue;
    const line = rawLine.trim();
    if (line === '' || NAME_ARTIFACT_RE.test(line)) continue;
    names.push(line);
  }

  const [first, second] = names.slice(-2);
  if (first === undefined) return { locationName: 'Unknown', parishName: null };
  if (second === undefined) return { locationName: first, parishName: null };
  return { locationName: second, parishName: first };
}
/**
 * Labels (as regex source) that end a field value when they start a line.
 * "Address" and "Phone" are included so that fields which follow each other
 * without a blank line do not bleed into one another.
 */
const FIELD_STOP_LABELS = [
  'Address', 'Phone', 'Fax', 'Email', 'Website', 'Church', 'Parish', 'Assistant',
  'Deacon', 'Resident', 'Rector', 'P\\.C', 'Pastoral', 'Mass Time',
];

/**
 * Build the matcher for one labelled field: the label at the end of a line,
 * then the shortest run of text up to a blank line, a stop label, or the end
 * of the input. Capture group 1 is the raw value.
 */
function buildFieldRegex(label: string): RegExp {
  const stops = FIELD_STOP_LABELS.map(stop => `\\n${stop}`).join('|');
  return new RegExp(`\\b${label}[ \\t]*\\n([\\s\\S]*?)(?:\\n[ \\t]*\\n|${stops}|$)`, 'i');
}

// Built once at module load rather than on every call.
const ADDRESS_FIELD_RE = buildFieldRegex('Address');
const PHONE_FIELD_RE = buildFieldRegex('Phone');
const EMAIL_FIELD_RE = buildFieldRegex('Email');

/** Return the single-line, trimmed value captured by `fieldRe`, or null. */
function matchField(text: string, fieldRe: RegExp): string | null {
  const m = fieldRe.exec(text);
  if (!m) return null;
  const value = m[1].replace(/\n/g, ' ').trim();
  return value || null;
}

/** Replace full-width parentheses (U+FF08 / U+FF09) with ASCII ones and trim. */
function toAsciiParens(s: string): string {
  return s.replace(/\uFF08/g, '(').replace(/\uFF09/g, ')').trim();
}

/**
 * Extract address, phone, email from the entry body (text after Path/Close).
 *
 * Each field is the text on the line(s) following its label. Multi-line
 * values are joined with single spaces. Full-width parentheses in the address
 * and phone are normalised to ASCII; the email is returned as found.
 *
 * @param body - Entry text; CR/CRLF line endings are accepted.
 * @returns Each field as a trimmed string, or null when the label is absent
 *          or its value is empty.
 */
export function extractFields(body: string): { address: string | null; phone: string | null; email: string | null } {
  const text = body.replace(/\r\n?/g, '\n');

  const address = matchField(text, ADDRESS_FIELD_RE);
  const phone = matchField(text, PHONE_FIELD_RE);
  const email = matchField(text, EMAIL_FIELD_RE);

  return {
    address: address ? toAsciiParens(address) : null,
    phone: phone ? toAsciiParens(phone) : null,
    email,
  };
}
// ─── Task 4: Time normalizer ──────────────────────────────────────────────────

/** "h:mm" followed by am/pm, with optional dots ("a.m.", "a.m", "am"). Applied to lower-cased text. */
const MERIDIEM_TIME_RE = /(\d{1,2}):(\d{2})\s*([ap])\.?m\.?/;

/** "h:mm noon". Applied to lower-cased text. */
const NOON_TIME_RE = /(\d{1,2}):(\d{2})\s*noon/;

/** Any time expression accepted above; used to cut the time out of a token. */
const TIME_TOKEN_RE = /\d{1,2}:\d{2}\s*(?:[ap]\.?m\.?|noon)/i;

/** Format hour and minute as zero-padded "HH:MM". */
function formatClock(hour: number, minute: number): string {
  return `${String(hour).padStart(2, '0')}:${String(minute).padStart(2, '0')}`;
}

/**
 * Normalise a time string to "HH:MM" 24-hour format.
 *
 * Accepts "8:00am", "11:30 am", "7:00 a.m.", "7:00 a.m", "12:00 noon", etc.
 * "12:xx am" maps to hour 00 and "12:xx pm" stays at hour 12. A "noon" time
 * keeps the digits as written.
 *
 * @param raw - Text containing a time; surrounding words are ignored.
 * @returns The normalised time, or null when no time is found or the digits
 *          are out of range (hour above 12 with am/pm, minute above 59).
 */
export function normalizeTime(raw: string): string | null {
  const s = raw.trim().toLowerCase();

  const noon = NOON_TIME_RE.exec(s);
  if (noon) {
    const hour = parseInt(noon[1], 10);
    const minute = parseInt(noon[2], 10);
    if (hour <= 23 && minute <= 59) return formatClock(hour, minute);
  }

  const m = MERIDIEM_TIME_RE.exec(s);
  if (!m) return null;

  let hour = parseInt(m[1], 10);
  const minute = parseInt(m[2], 10);
  // Reject values that would otherwise produce impossible times such as "25:00".
  if (hour > 12 || minute > 59) return null;

  if (m[3] === 'a') {
    if (hour === 12) hour = 0;
  } else if (hour !== 12) {
    hour += 12;
  }

  return formatClock(hour, minute);
}

// ─── Task 5: Schedule line parser ────────────────────────────────────────────

const CONDITIONAL_PATTERN = /^([\w\s]+(?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|month)[^:]*:)\s*/i;

/** Language tag, optionally wrapped in (possibly unbalanced or padded) parentheses. */
const LANGUAGE_TAG_RE = /\(?\s*(Cantonese|English|Tagalog|Chinese)\s*\)?/i;

/** "Saturday" / "Sat." anchor, together with a directly preceding "on". */
const SATURDAY_ANCHOR_RE = /\b(?:on\s+)?Sat(?:urday)?\b\.?/gi;

/** Leading/trailing separators left over once the time is cut out of a token. */
const NOTE_EDGE_RE = /^[\s\-\u2013\u2014:;]+|[\s\-\u2013\u2014:;]+$/g;

/**
 * Pull the first language tag out of a schedule line.
 *
 * "Chinese" is reported as 'Cantonese'; when no tag is present the language
 * is 'English'. The returned `rest` is the line without the tag, without
 * empty "()" pairs and without a dangling "(" at the end.
 */
function extractLanguage(line: string): { language: string; rest: string } {
  const match = LANGUAGE_TAG_RE.exec(line);
  if (!match) return { language: 'English', rest: line.trim() };

  const tag = match[1].toLowerCase();
  const language =
    tag === 'cantonese' || tag === 'chinese' ? 'Cantonese' : tag === 'tagalog' ? 'Tagalog' : 'English';

  // Replace the tag with a space so the words on either side stay separated.
  const rest = (line.slice(0, match.index) + ' ' + line.slice(match.index + match[0].length))
    .replace(/\(\s*\)/g, ' ')
    .replace(/\s*\(\s*$/, '')
    .trim();
  return { language, rest };
}

/**
 * Parse a single schedule text line into 0-N ParsedSchedule records.
 *
 * Steps: extract the language tag; take a leading conditional clause that
 * ends in a colon (e.g. "5th Sunday of the month:") as the section note;
 * drop "Saturday" / "on Saturday" anchors; split on commas. Every token that
 * contains a time yields one record. Text in the token other than the time
 * becomes that record's note, otherwise the section note is used.
 *
 * @param line - One schedule line, e.g. "8:00am,10:30 am (Cantonese)".
 * @param dayOfWeek - The fixed day for this section (0=Sun, 6=Sat for Anticipated).
 * @returns One record per token with a recognisable time; tokens without a
 *          time are skipped.
 */
export function parseScheduleLine(line: string, dayOfWeek: number): ParsedSchedule[] {
  const { language, rest } = extractLanguage(line);
  let remainder = rest;
  let sectionNotes: string | null = null;

  // Detect the conditional prefix before removing "Saturday", so a note such
  // as "1st Saturday of the month" keeps its day name.
  const condMatch = CONDITIONAL_PATTERN.exec(remainder);
  if (condMatch) {
    sectionNotes = condMatch[1].replace(/:$/, '').replace(/\s+/g, ' ').trim() || null;
    remainder = remainder.slice(condMatch[0].length);
  }

  remainder = remainder.replace(SATURDAY_ANCHOR_RE, ' ');

  const results: ParsedSchedule[] = [];
  for (const token of remainder.split(',').map(t => t.trim()).filter(Boolean)) {
    const time = normalizeTime(token);
    if (!time) continue;

    // Whatever remains after removing the time is a per-token note.
    const noteText = token
      .replace(TIME_TOKEN_RE, ' ')
      .replace(/\s+/g, ' ')
      .replace(NOTE_EDGE_RE, '');

    results.push({ dayOfWeek, time, language, notes: noteText || sectionNotes });
  }

  return results;
}

// ─── Task 6: Weekday day-prefix parser ───────────────────────────────────────

const DAY_ABBREV: Record<string, number> = {
  mon: 1, tue: 2, wed: 3, thur: 4, thu: 4, fri: 5, sat: 6, sun: 0,
};
const DAY_FULL: Record<string, number> = {
  monday: 1, tuesday: 2, wednesday: 3, thursday: 4, friday: 5, saturday: 6, sunday: 0,
};

/** Combined lookup; a Map avoids hits on inherited object properties. */
const DAY_LOOKUP = new Map<string, number>([...Object.entries(DAY_ABBREV), ...Object.entries(DAY_FULL)]);

/**
 * Map a lower-case day word to its index (0=Sun … 6=Sat).
 * A trailing "s" is tolerated, so "tues", "thurs" and "mondays" resolve too.
 */
function lookupDay(word: string | undefined): number | undefined {
  if (!word) return undefined;
  return DAY_LOOKUP.get(word) ?? DAY_LOOKUP.get(word.replace(/s$/, ''));
}

/**
 * Turn a day expression into a de-duplicated list of day indices.
 *
 * Understands lists ("mon, tue & thur") and ranges ("monday to friday",
 * "mon-sat"), including ranges that wrap past Saturday ("sat to sun").
 * Words that are not day names are ignored.
 */
function parseDays(prefix: string): number[] {
  const words = prefix
    .toLowerCase()
    .replace(/[.:]/g, '')
    .replace(/[-\u2013\u2014]/g, ' to ')
    .split(/[,&\s]+/)
    .filter(Boolean);

  const days: number[] = [];
  for (let i = 0; i < words.length; i++) {
    const from = lookupDay(words[i]);
    if (from === undefined) continue;

    const to = words[i + 1] === 'to' ? lookupDay(words[i + 2]) : undefined;
    if (to === undefined) {
      days.push(from);
      continue;
    }
    // Modular arithmetic so that a range may wrap from Saturday (6) to Sunday (0).
    const span = ((to - from + 7) % 7) + 1;
    for (let k = 0; k < span; k++) days.push((from + k) % 7);
    i += 2; // skip "to" and the range end
  }
  return [...new Set(days)];
}

// One day word (optionally abbreviated, optional trailing dot).
const DAY_TOKEN_SRC = '(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)[a-z]*\\.?';
// Joiners allowed between day words inside one comma-separated token.
const DAY_JOIN_SRC = '\\s*(?:[,&]|and\\b|to\\b|[-\\u2013\\u2014])\\s*';
const DAY_EXPR_SRC = `${DAY_TOKEN_SRC}(?:${DAY_JOIN_SRC}${DAY_TOKEN_SRC})*`;

// Matches a day-prefix at the start of a token (requires trailing space/colon)
const DAY_PREFIX_RE = new RegExp(`^(${DAY_EXPR_SRC})[\\s:]+`, 'i');

// Matches a token that is ONLY a day expression with no time — e.g. "Mon." "Tue."
const PURE_DAY_RE = new RegExp(`^(${DAY_EXPR_SRC})[\\s:]*$`, 'i');

/**
 * Parse a weekday mass line that may have day prefixes.
 *
 * The line is split on commas and tokens are processed left to right:
 *  - a token that is only a day expression ("Mon.") is remembered;
 *  - a token with a time emits one record per applicable day. The applicable
 *    days are the remembered days plus the token's own day prefix; if there
 *    are none, the days of the previous time token are reused;
 *  - before any day is named the default is Monday to Friday (1–5).
 *
 * @param line - e.g. "Mon., Tue., Thur. 8:00 a.m. (Cantonese)".
 * @returns One record per (day, time) pair, with `notes` always null.
 */
export function parseWeekdayLine(line: string): ParsedSchedule[] {
  const { language, rest } = extractLanguage(line);

  const results: ParsedSchedule[] = [];
  let currentDays: number[] = [1, 2, 3, 4, 5]; // default Mon–Fri
  let pendingDays: number[] = []; // days named by tokens that carried no time

  for (const token of rest.split(',').map(t => t.trim()).filter(Boolean)) {
    if (PURE_DAY_RE.test(token)) {
      pendingDays.push(...parseDays(token));
      continue;
    }

    const prefix = DAY_PREFIX_RE.exec(token);
    const tokenDays = prefix ? parseDays(prefix[1]) : [];
    const time = normalizeTime(prefix ? token.slice(prefix[0].length) : token);
    if (!time) {
      // Day prefix followed by something that is not a time: keep the days.
      pendingDays.push(...tokenDays);
      continue;
    }

    const days = [...new Set([...pendingDays, ...tokenDays])];
    pendingDays = [];
    if (days.length > 0) currentDays = days;
    for (const day of currentDays) results.push({ dayOfWeek: day, time, language, notes: null });
  }

  return results;
}