Files
ScraperControl/scripts/import-weekdaymasses.ts

1122 lines
42 KiB
TypeScript
Raw Normal View History

#!/usr/bin/env tsx
/**
* Import Catholic churches and mass schedules from weekdaymasses.org.uk
*
* weekdaymasses.org.uk covers ~4,000+ churches globally (GB, Ireland, and 49+
* international countries). All data is served on single HTML pages per area.
*
* Import strategy:
* 1. Fetch area pages (gb, ireland, outside-gb)
* 2. Parse `.church` divs for name, coordinates, address, phone, website, mass times
* 3. Convert mass times from H.MMam/pm to HH:MM 24h format
* 4. Detect country from address patterns (for outside-gb)
* 5. Match against existing churches, upsert with mass schedules
*
* Usage:
* npx tsx scripts/import-weekdaymasses.ts --all
* npx tsx scripts/import-weekdaymasses.ts --area gb
* npx tsx scripts/import-weekdaymasses.ts --area outside-gb --dry-run
* npx tsx scripts/import-weekdaymasses.ts --all --resume-from 500
* npx tsx scripts/import-weekdaymasses.ts --all --job-id {uuid}
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Connection string: DATABASE_URL from the environment (loaded from .env.local / .env above),
// falling back to a local development database when it is unset or empty.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the password portion (the text between ':' and '@') masked.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
// SSL (without certificate verification) is enabled only when the URL contains "neon".
const pool = new Pool({
connectionString: dbUrl,
ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma client that talks to the database through the pg pool via the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';
// ─── Constants ───────────────────────────────────────────────────────────────
// Origin of the source site (presumably prefixed to the relative `url` values below — confirm at the call site).
const SITE_BASE = 'https://weekdaymasses.org.uk';
// Identifies this importer to the remote server; sent as the User-Agent header by fetchPage.
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
// Importable areas keyed by the name accepted on the command line.
// `defaultCountry` is the country code assigned to every church on that page;
// an empty string means the country has to be detected from each church's address.
const AREA_PAGES: Record<string, { url: string; defaultCountry: string }> = {
'gb': { url: '/en/area/gb/churches', defaultCountry: 'GB' },
'ireland': { url: '/en/area/ireland/churches', defaultCountry: 'IE' },
'outside-gb': { url: '/en/area/outside-gb/churches', defaultCountry: '' }, // needs detection
};
// Known languages that may appear in parentheses after mass times, e.g. "6.30am(Tamil)".
// Entries are lower-case because extractTimeEntries lower-cases the annotation before the lookup;
// an annotation found here becomes the schedule's language, anything else is kept as a free-text note.
const KNOWN_LANGUAGES = new Set([
'english', 'tamil', 'sinhala', 'sinhalese', 'french', 'spanish', 'portuguese',
'polish', 'italian', 'german', 'latin', 'korean', 'japanese', 'chinese',
'mandarin', 'cantonese', 'tagalog', 'filipino', 'hindi', 'malayalam',
'konkani', 'telugu', 'kannada', 'marathi', 'bengali', 'urdu', 'arabic',
'vietnamese', 'indonesian', 'malay', 'dutch', 'hungarian', 'czech', 'slovak',
'slovenian', 'croatian', 'swahili', 'igbo', 'yoruba', 'ga', 'twi', 'ewe',
'shona', 'zulu', 'sesotho', 'afrikaans',
]);
// Country name patterns (matched anywhere in address, no $ anchor — addresses have trailing \r\n).
// Keys are lower-case names (plus a few aliases such as 'dubai' or 'u.a.e.') because detectCountry
// matches them against the lower-cased address; values are two-letter country codes.
// detectCountry tries the longest names first, so 'trinidad and tobago' is checked before 'trinidad'
// and 'dominican republic' before 'dominica'. Names of equal length keep the order written here.
const COUNTRY_NAME_MAP: Record<string, string> = {
'india': 'IN', 'sri lanka': 'LK', 'france': 'FR', 'italy': 'IT', 'spain': 'ES',
'portugal': 'PT', 'germany': 'DE', 'south korea': 'KR', 'korea': 'KR', 'japan': 'JP',
'philippines': 'PH', 'singapore': 'SG', 'malaysia': 'MY', 'hong kong': 'HK',
'thailand': 'TH', 'indonesia': 'ID', 'vietnam': 'VN', 'pakistan': 'PK',
'bangladesh': 'BD', 'nepal': 'NP', 'myanmar': 'MM', 'nigeria': 'NG', 'ghana': 'GH',
'kenya': 'KE', 'tanzania': 'TZ', 'uganda': 'UG', 'south africa': 'ZA',
'australia': 'AU', 'new zealand': 'NZ', 'canada': 'CA', 'belgium': 'BE',
'netherlands': 'NL', 'luxembourg': 'LU', 'switzerland': 'CH', 'austria': 'AT',
'poland': 'PL', 'hungary': 'HU', 'czech republic': 'CZ', 'czechia': 'CZ',
'mexico': 'MX', 'brazil': 'BR', 'argentina': 'AR', 'colombia': 'CO', 'peru': 'PE',
'chile': 'CL', 'china': 'CN', 'taiwan': 'TW', 'ireland': 'IE', 'malta': 'MT',
'cyprus': 'CY', 'croatia': 'HR', 'slovenia': 'SI', 'romania': 'RO', 'slovakia': 'SK',
'senegal': 'SN', 'grenada': 'GD', 'greece': 'GR', 'denmark': 'DK', 'sweden': 'SE',
'norway': 'NO', 'finland': 'FI', 'iceland': 'IS', 'latvia': 'LV', 'lithuania': 'LT',
'estonia': 'EE', 'ukraine': 'UA', 'russia': 'RU', 'georgia': 'GE', 'armenia': 'AM',
'jordan': 'JO', 'lebanon': 'LB', 'israel': 'IL', 'turkey': 'TR', 'egypt': 'EG',
'morocco': 'MA', 'tunisia': 'TN', 'cameroon': 'CM', 'ethiopia': 'ET',
'madagascar': 'MG', 'mozambique': 'MZ', 'zambia': 'ZM', 'zimbabwe': 'ZW',
'trinidad': 'TT', 'trinidad and tobago': 'TT', 'jamaica': 'JM', 'barbados': 'BB',
'bahamas': 'BS', 'bermuda': 'BM', 'costa rica': 'CR', 'panama': 'PA',
'guatemala': 'GT', 'honduras': 'HN', 'el salvador': 'SV', 'nicaragua': 'NI',
'ecuador': 'EC', 'venezuela': 'VE', 'bolivia': 'BO', 'paraguay': 'PY', 'uruguay': 'UY',
'puerto rico': 'PR', 'fiji': 'FJ', 'samoa': 'WS', 'tonga': 'TO', 'guam': 'GU',
'liechtenstein': 'LI', 'monaco': 'MC', 'andorra': 'AD', 'san marino': 'SM',
'serbia': 'RS', 'bosnia': 'BA', 'montenegro': 'ME', 'north macedonia': 'MK',
'albania': 'AL', 'kosovo': 'XK', 'bulgaria': 'BG', 'moldova': 'MD', 'belarus': 'BY',
'kazakhstan': 'KZ', 'uzbekistan': 'UZ', 'kyrgyzstan': 'KG', 'tajikistan': 'TJ',
'cambodia': 'KH', 'laos': 'LA', 'brunei': 'BN', 'east timor': 'TL', 'timor-leste': 'TL',
'papua new guinea': 'PG', 'mongolia': 'MN',
'curaçao': 'CW', 'curacao': 'CW', 'cape verde': 'CV', 'cabo verde': 'CV',
'the gambia': 'GM', 'gambia': 'GM',
'congo': 'CD', 'ivory coast': 'CI', "côte d'ivoire": 'CI', 'burkina faso': 'BF',
'suriname': 'SR', 'guyana': 'GY', 'belize': 'BZ', 'haiti': 'HT',
'dominican republic': 'DO', 'cuba': 'CU', 'qatar': 'QA',
'united arab emirates': 'AE', 'u.a.e.': 'AE', 'uae': 'AE', 'dubai': 'AE', 'abu dhabi': 'AE',
'saudi arabia': 'SA', 'bahrain': 'BH', 'kuwait': 'KW', 'oman': 'OM',
'antigua and barbuda': 'AG', 'antigua': 'AG',
'mauritius': 'MU', 'réunion': 'RE', 'reunion': 'RE', 'seychelles': 'SC',
'saint lucia': 'LC', 'st. lucia': 'LC', 'dominica': 'DM',
'saint vincent': 'VC', 'st. vincent': 'VC',
'saint kitts': 'KN', 'st. kitts': 'KN',
'u.s. virgin islands': 'VI', 'us virgin islands': 'VI', 'saint croix': 'VI',
'saint thomas': 'VI', 'virgin islands': 'VI',
'aruba': 'AW', 'bonaire': 'BQ', 'sint maarten': 'SX',
'iraq': 'IQ', 'iran': 'IR', 'afghanistan': 'AF',
'macao': 'MO', 'macau': 'MO',
};
// City/region-based detection fallback (for addresses without country names).
// Keys are lower-case because detectCountry matches them against the lower-cased address.
// detectCountry walks the entries in the order written here and returns the first hit, so
// keep more specific or more reliable names ahead of ambiguous ones (e.g. 'tenerife' is listed
// before 'santa cruz', which keeps "Santa Cruz de Tenerife" from resolving to Bolivia).
// Every key must appear only once: duplicate keys are a TypeScript error and, in plain
// JavaScript, a later duplicate silently overrides the earlier value.
const CITY_COUNTRY_MAP: Record<string, string> = {
  // Major cities that unambiguously identify a country
  'jakarta': 'ID', 'surabaya': 'ID', 'bandung': 'ID', 'yogyakarta': 'ID',
  'budapest': 'HU', 'berlin': 'DE', 'münchen': 'DE', 'munich': 'DE', 'hamburg': 'DE',
  'köln': 'DE', 'frankfurt': 'DE', 'düsseldorf': 'DE', 'stuttgart': 'DE',
  'paris': 'FR', 'lyon': 'FR', 'marseille': 'FR', 'toulouse': 'FR', 'lille': 'FR',
  'nantes': 'FR', 'bordeaux': 'FR', 'strasbourg': 'FR', 'rennes': 'FR',
  'roma': 'IT', 'rome': 'IT', 'milano': 'IT', 'milan': 'IT', 'napoli': 'IT',
  'torino': 'IT', 'firenze': 'IT', 'florence': 'IT', 'bologna': 'IT', 'genova': 'IT',
  'madrid': 'ES', 'barcelona': 'ES', 'valencia': 'ES', 'sevilla': 'ES', 'seville': 'ES',
  'málaga': 'ES', 'bilbao': 'ES', 'mallorca': 'ES', 'tenerife': 'ES',
  'lisboa': 'PT', 'lisbon': 'PT', 'porto': 'PT', 'faro': 'PT',
  'warszawa': 'PL', 'warsaw': 'PL', 'kraków': 'PL', 'krakow': 'PL',
  'praha': 'CZ', 'prague': 'CZ', 'brno': 'CZ',
  'wien': 'AT', 'vienna': 'AT', 'innsbruck': 'AT', 'salzburg': 'AT', 'graz': 'AT',
  'zürich': 'CH', 'zurich': 'CH', 'genève': 'CH', 'geneva': 'CH', 'bern': 'CH', 'basel': 'CH',
  'amsterdam': 'NL', 'rotterdam': 'NL', 'den haag': 'NL',
  'bruxelles': 'BE', 'brussels': 'BE', 'brugge': 'BE', 'antwerpen': 'BE',
  'københavn': 'DK', 'copenhagen': 'DK', 'aarhus': 'DK', 'aalborg': 'DK',
  'stockholm': 'SE', 'göteborg': 'SE', 'malmö': 'SE',
  'oslo': 'NO', 'bergen': 'NO',
  'helsinki': 'FI',
  'reykjavik': 'IS',
  'riga': 'LV', 'vilnius': 'LT', 'tallinn': 'EE',
  'kyiv': 'UA', 'київ': 'UA', 'lviv': 'UA',
  'москва': 'RU', 'moscow': 'RU', 'санкт-петербург': 'RU', 'магадан': 'RU',
  'калуга': 'RU', 'новосибирск': 'RU', 'владивосток': 'RU',
  'tbilisi': 'GE', 'yerevan': 'AM',
  'amman': 'JO', 'beirut': 'LB', 'istanbul': 'TR', 'ankara': 'TR',
  'cairo': 'EG', 'casablanca': 'MA', 'tunis': 'TN',
  'nairobi': 'KE', 'dar es salaam': 'TZ', 'kampala': 'UG', 'lagos': 'NG', 'accra': 'GH',
  'johannesburg': 'ZA', 'cape town': 'ZA', 'durban': 'ZA', 'pretoria': 'ZA',
  'seoul': 'KR', 'busan': 'KR', 'tokyo': 'JP', 'osaka': 'JP', 'yokohama': 'JP',
  'nagasaki': 'JP', 'kyoto': 'JP', 'beijing': 'CN', 'shanghai': 'CN',
  'taipei': 'TW', 'mumbai': 'IN', 'chennai': 'IN', 'kolkata': 'IN', 'delhi': 'IN',
  'new delhi': 'IN', 'bangalore': 'IN', 'bengaluru': 'IN', 'hyderabad': 'IN', 'goa': 'IN',
  'colombo': 'LK', 'matara': 'LK', 'kandy': 'LK', 'galle': 'LK',
  'kuala lumpur': 'MY', 'penang': 'MY',
  'manila': 'PH', 'cebu': 'PH',
  'bangkok': 'TH', 'chiang mai': 'TH',
  'hà nội': 'VN', 'hanoi': 'VN', 'ho chi minh': 'VN', 'saigon': 'VN',
  'phnom penh': 'KH', 'vientiane': 'LA',
  'sydney': 'AU', 'melbourne': 'AU', 'brisbane': 'AU', 'perth': 'AU', 'adelaide': 'AU',
  'auckland': 'NZ', 'wellington': 'NZ', 'christchurch': 'NZ',
  'toronto': 'CA', 'vancouver': 'CA', 'montreal': 'CA', 'ottawa': 'CA',
  'mexico city': 'MX', 'guadalajara': 'MX', 'monterrey': 'MX',
  'são paulo': 'BR', 'rio de janeiro': 'BR', 'brasília': 'BR',
  'buenos aires': 'AR', 'bogotá': 'CO', 'lima': 'PE', 'santiago': 'CL',
  'vaduz': 'LI', 'monaco': 'MC',
  'valletta': 'MT', 'nicosia': 'CY', 'zagreb': 'HR', 'ljubljana': 'SI',
  'bratislava': 'SK', 'bucharest': 'RO', 'sofia': 'BG', 'belgrade': 'RS',
  'nadi': 'FJ', 'suva': 'FJ',
  'san juan': 'PR', 'viejo san juan': 'PR',
  // Cities missed in the first pass
  'calais': 'FR', 'lourdes': 'FR', 'nice': 'FR', 'montpellier': 'FR', 'toulon': 'FR',
  'abidjan': 'CI', 'douala': 'CM', 'yaoundé': 'CM', 'kinshasa': 'CD', 'lusaka': 'ZM',
  'harare': 'ZW', 'maputo': 'MZ', 'antananarivo': 'MG', 'dakar': 'SN',
  'pademangan': 'ID', 'jakarta utara': 'ID', 'denpasar': 'ID', 'semarang': 'ID',
  'makassar': 'ID', 'medan': 'ID', 'bogor': 'ID', 'malang': 'ID', 'palembang': 'ID',
  '서울': 'KR', '부산': 'KR', // Seoul, Busan in Korean
  // Japanese city names in kanji
  '東京': 'JP', '大阪': 'JP', '横浜': 'JP', '名古屋': 'JP', '長崎': 'JP',
  '京都': 'JP', '神戸': 'JP', '福岡': 'JP', '札幌': 'JP', '仙台': 'JP', '広島': 'JP',
  // Chinese city names in hanzi
  '北京': 'CN', '上海': 'CN', '深圳': 'CN', '广州': 'CN', '香港': 'HK',
  // More missing cities
  'kuching': 'MY', 'kota kinabalu': 'MY', 'ipoh': 'MY', 'johor bahru': 'MY', 'sarawak': 'MY',
  'trondheim': 'NO', 'stavanger': 'NO', 'tromsø': 'NO',
  'taastrup': 'DK', 'odense': 'DK',
  'cancún': 'MX', 'playa del carmen': 'MX', 'mérida': 'MX', 'puebla': 'MX', 'cancun': 'MX',
  'addis ababa': 'ET',
  'la paz': 'BO', 'cochabamba': 'BO', 'santa cruz': 'BO',
  'willemstad': 'CW', 'curaçao': 'CW', 'curacao': 'CW',
  'port of spain': 'TT', 'bridgetown': 'BB', 'nassau': 'BS',
  'siem reap': 'KH', // 'phnom penh' is already listed above
  'port moresby': 'PG',
  'ulaanbaatar': 'MN',
  'praia': 'CV', 'cape verde': 'CV',
  'celebration': 'US', // Celebration, Florida — city not great, but helps
  'the gambia': 'GM', 'gambia': 'GM', 'banjul': 'GM',
  'playa blanca': 'ES', 'gran canaria': 'ES', 'fuerteventura': 'ES', 'lanzarote': 'ES',
  'tirana': 'AL', 'durrës': 'AL',
  'podgorica': 'ME', 'budva': 'ME',
  'skopje': 'MK', 'pristina': 'XK', 'sarajevo': 'BA',
  'minsk': 'BY', 'chișinău': 'MD', 'chisinau': 'MD',
  'bishkek': 'KG', 'dushanbe': 'TJ', 'tashkent': 'UZ', 'almaty': 'KZ', 'astana': 'KZ',
  'lekki': 'NG', 'abuja': 'NG', 'enugu': 'NG', 'yaba': 'NG', 'ikeja': 'NG',
  // Serbian
  'beograd': 'RS', 'novi sad': 'RS',
  // Thai
  'phuket': 'TH', 'pattaya': 'TH', 'hua hin': 'TH',
  // Spanish cities
  'alicante': 'ES', 'zaragoza': 'ES', 'murcia': 'ES', 'palma': 'ES',
  'granada': 'ES', 'córdoba': 'ES', 'santander': 'ES', 'cádiz': 'ES',
  'san sebastián': 'ES', 'las palmas': 'ES', 'santa cruz de tenerife': 'ES',
  // Belgian
  'woluwe': 'BE', 'ixelles': 'BE', 'schaerbeek': 'BE', 'liège': 'BE', 'namur': 'BE',
  // Portuguese
  'loulé': 'PT', 'albufeira': 'PT', 'coimbra': 'PT', 'braga': 'PT', 'funchal': 'PT',
  // Turkish
  'mersin': 'TR', 'izmir': 'TR', 'antalya': 'TR', 'trabzon': 'TR',
  // Lebanese (French spelling)
  'beyrouth': 'LB',
  // Burkina Faso
  'ouagadougou': 'BF', 'bobo-dioulasso': 'BF',
  // Greek
  'heraklion': 'GR', 'ηράκλειο': 'GR', 'μυτιλήνη': 'GR', 'αθήνα': 'GR',
  'athens': 'GR', 'thessaloniki': 'GR', 'patras': 'GR',
  // Bulgarian (Latin and Cyrillic spellings)
  'plovdiv': 'BG', 'пловдив': 'BG', 'варна': 'BG',
  // Vietnamese with diacritics
  'sài gòn': 'VN', 'hồ chí minh': 'VN', 'đà nẵng': 'VN',
  // Moldovan (cedilla spelling; the comma-below spelling is listed above)
  'chişinău': 'MD',
  // Hungarian
  'ferenciek': 'HU', 'debrecen': 'HU', 'szeged': 'HU', 'pécs': 'HU',
  // Polish cities
  'kalisz': 'PL', 'gdańsk': 'PL', 'wrocław': 'PL', 'poznań': 'PL', 'łódź': 'PL',
  'katowice': 'PL', 'lublin': 'PL', 'szczecin': 'PL',
  // Bermuda
  'warwick': 'BM',
  // Maltese ('valletta' is already listed above)
  'sliema': 'MT',
};
// Postal code / state code patterns.
// detectCountry tests these in array order against the address in its original letter case
// and returns the country of the first pattern that matches, so order matters: a looser
// pattern placed earlier wins over a stricter one placed later.
// NOTE(review): the 6-digit "Indian" pattern accepts any two groups of three digits, so it may
// also match other 6-digit postal codes — confirm this ordering is intended.
const POSTAL_PATTERNS: Array<{ pattern: RegExp; country: string }> = [
{ pattern: /\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b/, country: 'GB' }, // UK postcode
{ pattern: /\b[A-Z]\d{2}\s*[A-Z0-9]{4}\b/, country: 'IE' }, // Irish Eircode
{ pattern: /\bCittà del Vaticano\b/i, country: 'VA' },
{ pattern: /\b\d{3}\s*\d{3}\b/, country: 'IN' }, // Indian 6-digit with optional space
{ pattern: /\bNSW\s+\d{4}\b/, country: 'AU' }, // Australian state codes
{ pattern: /\bVIC\s+\d{4}\b/, country: 'AU' },
{ pattern: /\bQLD\s+\d{4}\b/, country: 'AU' },
{ pattern: /\bSA\s+\d{4}\b/, country: 'AU' },
{ pattern: /\bWA\s+\d{4}\b/, country: 'AU' },
{ pattern: /\bTAS\s+\d{4}\b/, country: 'AU' },
{ pattern: /\bACT\s+\d{4}\b/, country: 'AU' },
{ pattern: /\bNT\s+\d{4}\b/, country: 'AU' },
{ pattern: /\bA-\d{4}\b/, country: 'AT' }, // Austrian postal prefix
{ pattern: /\b, PR,?\s*\d{5}\b/, country: 'PR' }, // Puerto Rico
{ pattern: /\b\d{2}-\d{3}\b/, country: 'PL' }, // Polish postal code (XX-XXX)
// US state abbreviation + ZIP (e.g., "NY, 11201" or "NY 11201")
{ pattern: /\b(AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY)[,\s]+\d{5}\b/, country: 'US' },
];
// ─── Types ───────────────────────────────────────────────────────────────────
/** One church as extracted from a `<div class="church">` block of an area page. */
interface ParsedChurch {
churchId: string; // weekdaymasses numeric ID (the digits of the block's id="pNNNNN")
name: string; // contents of the block's <h3>, with HTML entities decoded
latitude: number; // taken from the map link; stays 0 when the block has no map link
longitude: number; // taken from the map link; stays 0 when the block has no map link
address: string | null; // text after the last <br> inside p.address, or null
phone: string | null; // text after "Tel:" inside p.telephone, or null
website: string | null; // href of the "Link to church website:" anchor, or null
country: string; // two-letter code; 'XX' when no country could be determined
schedules: ParsedSchedule[]; // one entry per (day, time) pair parsed from the p.times paragraphs
}
/** A single recurring mass: one day of the week at one time. */
interface ParsedSchedule {
dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
time: string; // 24-hour "HH:MM", e.g. "07:00", "18:30"
language: string; // capitalised language name; 'English' when none was annotated
notes: string | null; // parenthesised annotation that was not a known language, else null
}
/** Running counters for one import run. */
interface ImportStats {
churchesParsed: number; // blocks successfully parsed into a ParsedChurch
churchesMatched: number; // parsed churches that matched (and updated) an existing row
churchesCreated: number; // parsed churches inserted as new rows
churchesSkipped: number; // writes abandoned because of a unique-constraint violation
massSchedulesCreated: number; // mass schedule rows written
errors: number; // blocks that failed to parse or to be processed
}
/** Options parsed from the command line by parseArgs. */
interface CLIArgs {
all: boolean; // --all
area?: string; // --area <name>
dryRun: boolean; // --dry-run: parse only, no database writes
resumeFrom: number; // --resume-from <n>: number of churches to skip (0 = start at the beginning)
jobId?: string; // --job-id <uuid>: background job row to report progress to
}
// ─── CLI ─────────────────────────────────────────────────────────────────────
/**
 * Parse the command-line flags in `process.argv` into a {@link CLIArgs} object.
 *
 * Recognised flags: `--all`, `--area <name>`, `--dry-run`, `--resume-from <n>`,
 * `--job-id <uuid>` and `--help` (prints usage and exits with status 0).
 * Unrecognised arguments are ignored.
 *
 * The process exits with status 1 (after printing an error) when
 *  - a flag that takes a value is the last argument or is followed by another flag,
 *  - `--resume-from` is not a non-negative integer (a `NaN` here would otherwise make
 *    every "index < resumeFrom" comparison false and silently disable resuming), or
 *  - neither `--all` nor `--area` was given.
 *
 * @returns the parsed options, with defaults `all: false`, `dryRun: false`, `resumeFrom: 0`.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false, resumeFrom: 0 };

  // Report a usage error and terminate; process.exit does not return.
  const exitWithError = (message: string): void => {
    console.error(`Error: ${message}`);
    process.exit(1);
  };

  // Return the argument that follows the flag at `flagIndex`, rejecting a missing
  // value or one that is itself a flag (e.g. "--area --dry-run").
  const takeValue = (flagIndex: number): string => {
    const value = args[flagIndex + 1];
    if (value === undefined || value.startsWith('--')) {
      exitWithError(`${args[flagIndex]} requires a value`);
    }
    return value ?? '';
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--area':
        result.area = takeValue(i);
        i++; // skip the consumed value
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = takeValue(i);
        i++; // skip the consumed value
        if (!/^\d+$/.test(raw)) {
          exitWithError(`--resume-from expects a non-negative integer, got "${raw}"`);
        }
        result.resumeFrom = Number.parseInt(raw, 10);
        break;
      }
      case '--job-id':
        result.jobId = takeValue(i);
        i++; // skip the consumed value
        break;
      case '--help':
        console.log(`Usage: npx tsx scripts/import-weekdaymasses.ts [options]
--all Import all 3 area pages (gb, ireland, outside-gb)
--area <name> Import specific area (gb, ireland, outside-gb)
--dry-run No database writes
--resume-from <n> Skip first N churches
--job-id <uuid> Background job tracking`);
        process.exit(0);
    }
  }

  if (!result.all && !result.area) {
    exitWithError('specify --all or --area <name>');
  }
  return result;
}
// ─── HTTP ────────────────────────────────────────────────────────────────────
/**
 * Download a page as text.
 *
 * Sends the importer's User-Agent and an HTML-preferring Accept header.
 * Never throws: a non-2xx response or a network failure is logged and reported as `null`.
 *
 * @param url absolute URL to fetch
 * @returns the response body, or `null` when the request failed
 */
async function fetchPage(url: string): Promise<string | null> {
  const requestHeaders = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  try {
    const reply = await fetch(url, { headers: requestHeaders });
    if (reply.ok) {
      return await reply.text();
    }
    console.error(` HTTP ${reply.status} for ${url}`);
    return null;
  } catch (failure) {
    const reason = failure instanceof Error ? failure.message : failure;
    console.error(` Fetch error for ${url}: ${reason}`);
    return null;
  }
}
// ─── HTML Parsing ────────────────────────────────────────────────────────────
/**
 * Cut an area page into one HTML fragment per church.
 *
 * A church starts at `<div class="church" id="pNNNNN">`. From there the scanner counts
 * every following `<div` as one level deeper and every `</div>` as one level shallower,
 * and the fragment ends at the `</div>` that brings the depth back to zero.
 * If the page runs out of `</div>` tags first, the fragment ends wherever the scan stopped.
 *
 * @param html full HTML of an area page
 * @returns the church fragments in document order
 */
function extractChurchBlocks(html: string): string[] {
  const OPEN_TAG = '<div';
  const CLOSE_TAG = '</div>';
  const fragments: string[] = [];

  for (const opener of html.matchAll(/<div\s+class="church"\s+id="p(\d+)">/g)) {
    const fragmentStart = opener.index ?? 0;
    let cursor = fragmentStart + opener[0].length;
    let openDivs = 1;

    while (openDivs > 0 && cursor < html.length) {
      const closeAt = html.indexOf(CLOSE_TAG, cursor);
      if (closeAt === -1) break; // unbalanced markup: give up on this fragment
      const openAt = html.indexOf(OPEN_TAG, cursor);
      const nestedDivComesFirst = openAt !== -1 && openAt < closeAt;
      if (nestedDivComesFirst) {
        openDivs += 1;
        cursor = openAt + OPEN_TAG.length;
      } else {
        openDivs -= 1;
        cursor = closeAt + CLOSE_TAG.length;
      }
    }

    fragments.push(html.substring(fragmentStart, cursor));
  }

  return fragments;
}
/**
 * Parse a single church block HTML into structured data.
 *
 * Extracted fields:
 *  - `churchId`: digits of the block's `id="pNNNNN"`
 *  - `name`: contents of the `<h3>`
 *  - `latitude` / `longitude`: from the `lat=…&lon=…` map link; both stay 0 when the link is
 *    missing or its numbers are malformed
 *  - `address`: text after the last `<br>` inside `p.address`
 *  - `phone`: text after `Tel:` inside `p.telephone`
 *  - `website`: href of the "Link to church website:" anchor
 *  - `country`: `defaultCountry`, else detected from the address, else `'XX'`
 *  - `schedules`: parsed from the `p.times` paragraphs
 *
 * All text fields have their HTML entities decoded (and markup removed where the source is a
 * text node), so an address such as `Côte d&#39;Ivoire` is stored — and country-matched — as
 * `Côte d'Ivoire`, and a website href containing `&amp;` becomes a usable URL.
 *
 * @param html one fragment returned by extractChurchBlocks
 * @param defaultCountry country code for the whole area page, or '' to detect per church
 * @returns the parsed church, or `null` when the block has no id or no `<h3>`
 */
function parseChurchBlock(html: string, defaultCountry: string): ParsedChurch | null {
  // Church ID from div id="pNNNNN"
  const idMatch = html.match(/id="p(\d+)"/);
  if (!idMatch) return null;
  const churchId = idMatch[1];

  // Name from h3
  const nameMatch = html.match(/<h3>(.*?)<\/h3>/s);
  if (!nameMatch) return null;
  const name = decodeHtmlEntities(nameMatch[1].trim());

  // Coordinates from map link ("&" may be written as "&amp;" in the href)
  let latitude = 0;
  let longitude = 0;
  const mapMatch = html.match(/lat=(-?[\d.]+)&(?:amp;)?lon=(-?[\d.]+)/);
  if (mapMatch) {
    const lat = parseFloat(mapMatch[1]);
    const lon = parseFloat(mapMatch[2]);
    // "[\d.]+" also accepts strings like "." or "1.2.3"; never let NaN through.
    if (Number.isFinite(lat) && Number.isFinite(lon)) {
      latitude = lat;
      longitude = lon;
    }
  }

  // Address from p.address — text after the last <br> (or after the Streetview link)
  let address: string | null = null;
  const addressMatch = html.match(/<p\s+class="address">([\s\S]*?)<\/p>/);
  if (addressMatch) {
    const addressHtml = addressMatch[1];
    const brIdx = addressHtml.lastIndexOf('<br');
    if (brIdx !== -1) {
      const textAfterTag = addressHtml.substring(brIdx).replace(/<br\s*\/?>/, '').trim();
      // Strip tags first, then decode, so an encoded "&lt;" is kept as text rather than removed.
      address = decodeHtmlEntities(stripHtmlTags(textAfterTag)).trim() || null;
    }
  }

  // Phone from p.telephone
  let phone: string | null = null;
  const phoneMatch = html.match(/<p\s+class="telephone">[\s\S]*?Tel:<\/span>\s*(.*?)<\/p>/);
  if (phoneMatch) {
    phone = decodeHtmlEntities(stripHtmlTags(phoneMatch[1])).trim() || null;
  }

  // Website from p.transport with "Link to church website:"
  let website: string | null = null;
  const websiteMatch = html.match(/Link to church website:<\/span>\s*<a[^>]+href="([^"]+)"/);
  if (websiteMatch) {
    // Attribute values are entity-encoded in HTML ("?a=1&amp;b=2").
    website = decodeHtmlEntities(websiteMatch[1]);
  }

  // Country detection
  let country = defaultCountry;
  if (!country && address) {
    country = detectCountry(address);
  }
  if (!country) country = 'XX'; // Unknown

  // Mass schedules from p.times
  const schedules = parseScheduleBlocks(html);

  return { churchId, name, latitude, longitude, address, phone, website, country, schedules };
}
/**
 * Detect country from address text. Strategies are tried in this order and the first
 * one that produces a result wins:
 *  1. Country name (COUNTRY_NAME_MAP), longest names first
 *  2. City/region name (CITY_COUNTRY_MAP), in table order
 *  3. Full US state name followed by a 5-digit ZIP, or at the very end of the address
 *  4. Postal code / state code patterns (POSTAL_PATTERNS), in table order
 *
 * Country and state names only match as whole words, so "iran" is not found inside
 * "Tirana", "oman" not inside "Roman" and "india" not inside "Indiana".
 * When a matched country name is also (part of) a US state name — "Georgia",
 * "New Mexico" — and that state name is directly followed by a ZIP code, the address is
 * treated as a US address.
 *
 * @param address free-text address; may contain line breaks
 * @returns a two-letter country code, or '' when nothing matched
 */
function detectCountry(address: string): string {
  // Clean address for matching
  const cleaned = address.replace(/\r?\n/g, ' ').trim();
  const lower = cleaned.toLowerCase();

  // True for letters and digits of any script; everything else counts as a word boundary.
  const isWordChar = (ch: string | undefined): boolean =>
    ch !== undefined && /[\p{L}\p{N}]/u.test(ch);

  // Start index of every occurrence of `phrase` in the lower-cased address.
  const occurrencesOf = (phrase: string): number[] => {
    const found: number[] = [];
    if (phrase.length === 0) return found;
    for (let at = lower.indexOf(phrase); at !== -1; at = lower.indexOf(phrase, at + 1)) {
      found.push(at);
    }
    return found;
  };

  // True when `phrase` occurs with no letter/digit directly before or after it.
  const hasWholePhrase = (phrase: string): boolean =>
    occurrencesOf(phrase).some(
      (at) => !isWordChar(lower[at - 1]) && !isWordChar(lower[at + phrase.length]),
    );

  // Text following each occurrence of `state` that starts at a word boundary.
  const tailsAfterState = (state: string): string[] =>
    occurrencesOf(state)
      .filter((at) => !isWordChar(lower[at - 1]))
      .map((at) => lower.substring(at + state.length));

  const followedByZip = (tail: string): boolean => /^[,\s]+\d{5}/.test(tail);

  const US_STATES = [
    'alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado',
    'connecticut', 'delaware', 'florida', 'hawaii', 'idaho',
    'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana',
    'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota',
    'mississippi', 'missouri', 'montana', 'nebraska', 'nevada',
    'new hampshire', 'new jersey', 'new mexico', 'new york', 'north carolina',
    'north dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania',
    'rhode island', 'south carolina', 'south dakota', 'tennessee', 'texas',
    'utah', 'vermont', 'virginia', 'washington', 'west virginia',
    'wisconsin', 'wyoming', 'georgia', // Georgia last — also a country, but with ZIP it's US
  ];

  // 1. Country name match (longer names first so "dominican republic" beats "dominica")
  const sortedCountries = Object.entries(COUNTRY_NAME_MAP)
    .sort((a, b) => b[0].length - a[0].length);
  for (const [name, code] of sortedCountries) {
    if (!hasWholePhrase(name)) continue;
    // "Atlanta, Georgia 30303" / "Santa Fe, New Mexico 87501": the country name is really
    // a US state name followed by a ZIP code.
    const isUsStateWithZip = US_STATES.some(
      (state) => state.includes(name) && tailsAfterState(state).some(followedByZip),
    );
    return isUsStateWithZip ? 'US' : code;
  }

  // 2. City/region match — the city must be delimited by whitespace/punctuation (or the
  //    string edges) on both sides; every occurrence is considered, not just the first.
  for (const [city, code] of Object.entries(CITY_COUNTRY_MAP)) {
    const matched = occurrencesOf(city).some((at) => {
      const end = at + city.length;
      const startsCleanly = at === 0 || /[\s,.(]/.test(lower[at - 1]);
      const endsCleanly = end === lower.length || /[\s,.):]/.test(lower[end]);
      return startsCleanly && endsCleanly;
    });
    if (matched) return code;
  }

  // 3. US state detection — full state name followed by ", ZIP" / " ZIP", or (except for
  //    Georgia, which is also a country) the state name as the last thing in the address.
  for (const state of US_STATES) {
    const tails = tailsAfterState(state);
    if (tails.some(followedByZip)) return 'US';
    if (state !== 'georgia' && tails.some((tail) => /^[,\s]*$/.test(tail))) return 'US';
  }

  // 4. Postal code patterns (case-sensitive, so they run on the original-case text)
  for (const { pattern, country } of POSTAL_PATTERNS) {
    if (pattern.test(cleaned)) return country;
  }

  return '';
}
/**
 * Collect the mass schedules from every `<p class="times">` paragraph of a church block.
 *
 * Each paragraph is reduced to plain text (tags removed, whitespace runs collapsed to a
 * single space, trimmed) and handed to parseTimesLine; the results are concatenated in
 * document order.
 */
function parseScheduleBlocks(html: string): ParsedSchedule[] {
  const timesParagraphs = html.matchAll(/<p\s+class="times">([\s\S]*?)<\/p>/g);
  return Array.from(timesParagraphs).flatMap((paragraph) => {
    const line = stripHtmlTags(paragraph[1]).replace(/\s+/g, ' ').trim();
    return parseTimesLine(line);
  });
}
/**
 * Turn one line of mass times into schedule entries. Supported shapes include:
 *   "Sunday: 6.30am(Tamil), 8.30am(Tamil), 5.30pm(English)"
 *   "Weekday: Monday, Tuesday, Wednesday 6.15am"
 *   "Mon Tue Wed Thu Fri: 6.30am(Tamil)"
 *
 * The text before the first colon names the default days. The text after it is split on
 * semicolons; a segment that begins with its own day names uses those days instead of the
 * defaults. Segments without any applicable day, and times that cannot be converted to
 * 24-hour form, are dropped. A line without a colon yields nothing.
 *
 * @returns one entry per (time, day) combination
 */
function parseTimesLine(text: string): ParsedSchedule[] {
  const separatorAt = text.indexOf(':');
  if (separatorAt === -1) return [];

  const dayLabel = text.substring(0, separatorAt).trim();
  const timesText = text.substring(separatorAt + 1).trim();
  const labelDays = parseDayLabel(dayLabel);

  const collected: ParsedSchedule[] = [];
  // Semicolons separate groups such as "Monday 10.00am; Thursday 7.30pm".
  const segments = timesText
    .split(';')
    .map((segment) => segment.trim())
    .filter(Boolean);

  for (const segment of segments) {
    const { specificDays, cleanedTimePart } = extractSpecificDays(segment);
    const hasOwnDays = specificDays.length > 0;
    const days = hasOwnDays ? specificDays : labelDays;
    if (days.length === 0) continue;

    // Only strip the leading day names when they were actually recognised.
    const segmentTimes = hasOwnDays ? cleanedTimePart : segment;
    for (const { time, language, notes } of extractTimeEntries(segmentTimes)) {
      const time24 = convertTo24h(time);
      if (!time24) continue;
      for (const dayOfWeek of days) {
        collected.push({ dayOfWeek, time: time24, language, notes });
      }
    }
  }

  return collected;
}
// Day name mappings: every full English day name and its three-letter abbreviation,
// lower-case, mapped to its day number (0 = Sunday … 6 = Saturday).
const DAY_MAP: Record<string, number> = Object.fromEntries(
  ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday'].flatMap(
    (dayName, dayNumber): Array<[string, number]> => [
      [dayName, dayNumber],
      [dayName.slice(0, 3), dayNumber],
    ],
  ),
);
/**
 * Translate the day label (the text left of the colon) into day numbers.
 *
 *  - "Weekday" / "Weekdays" (the whole label, any case) → Monday to Friday
 *  - any label mentioning "holy day" / "holyday" → no days (not a regular schedule)
 *  - otherwise every whitespace/comma-separated word that is a known day name or
 *    abbreviation contributes its number, in label order; other words are ignored
 *
 * @returns day numbers with 0 = Sunday … 6 = Saturday; empty when nothing was recognised
 */
function parseDayLabel(label: string): number[] {
  const normalized = label.toLowerCase().trim();

  if (['weekday', 'weekdays'].includes(normalized)) {
    return [1, 2, 3, 4, 5];
  }

  const mentionsHolyDay = ['holy day', 'holyday'].some((marker) => normalized.includes(marker));
  if (mentionsHolyDay) {
    return [];
  }

  // e.g. "Mon Tue Wed Thu Fri" or "Monday, Tuesday"
  return normalized
    .split(/[\s,]+/)
    .map((word) => DAY_MAP[word])
    .filter((dayNumber) => dayNumber !== undefined);
}
/**
 * Check if the time part starts with specific day names.
 * e.g., "Monday, Tuesday, Wednesday 6.15am" -> days=[1,2,3], cleaned="6.15am"
 *
 * A day name is only recognised as a whole word: "Sung Mass 11.00am", "Monthly …" or
 * "Wedding …" do not count as Sunday/Monday/Wednesday. Accepted spellings (any case) are
 * the full names, their plurals ("Sundays") and the abbreviations Mon, Tue/Tues, Wed/Weds,
 * Thu/Thur/Thurs, Fri, Sat, Sun. Day names may be separated by commas and/or whitespace.
 *
 * @param timePart one semicolon-separated segment of a times line
 * @returns `specificDays` — the leading day numbers (0 = Sunday … 6 = Saturday), empty when
 *          the segment does not start with a day name; `cleanedTimePart` — the segment with
 *          the leading day names removed (the unchanged segment when none were found)
 */
function extractSpecificDays(timePart: string): { specificDays: number[]; cleanedTimePart: string } {
  // One or more day names at the start. The negative lookahead rejects a day name that is
  // merely the prefix of a longer word.
  const dayPattern =
    /^((?:(?:(?:Mon|Tues|Wednes|Thurs|Fri|Satur|Sun)days?|Mon|Tues?|Weds?|Thu(?:rs?)?|Fri|Sat|Sun)(?![a-z])[,\s]*)+)/i;
  const match = timePart.match(dayPattern);
  if (!match) {
    return { specificDays: [], cleanedTimePart: timePart };
  }

  // Every accepted spelling starts with the three-letter abbreviation used as a DAY_MAP key.
  const dayNumbers = match[1]
    .split(/[\s,]+/)
    .filter(Boolean)
    .map((word) => DAY_MAP[word.toLowerCase().slice(0, 3)]);

  // Defensive: if any word failed to resolve, treat the segment as having no day prefix.
  if (dayNumbers.length === 0 || dayNumbers.some((day) => day === undefined)) {
    return { specificDays: [], cleanedTimePart: timePart };
  }

  return {
    specificDays: dayNumbers,
    cleanedTimePart: timePart.substring(match[0].length).trim(),
  };
}
/** One time found in a times string, before conversion to 24-hour form. */
interface TimeEntry {
time: string; // Raw time with whitespace removed: "7.00am", "6.30pm"
language: string; // Capitalised known language from the annotation; 'English' otherwise
notes: string | null; // Annotation text that is not a known language; null otherwise
}
/**
 * Find every "H.MMam" / "H.MMpm" time in a string, together with the optional
 * parenthesised annotation that follows it.
 * e.g., "7.00am(Tamil), 8.30am(Vigil), 12.00pm" ->
 *   7.00am / Tamil, 8.30am / English + note "Vigil", 12.00pm / English
 *
 * An annotation that is a known language (compared case-insensitively) becomes the entry's
 * language, written with a leading capital; any other non-empty annotation is kept as a note
 * and the language defaults to 'English'. Whitespace inside the time itself is removed.
 */
function extractTimeEntries(text: string): TimeEntry[] {
  // digits.digits + am/pm, optionally followed by "(annotation)"
  const timeWithAnnotation = /(\d{1,2}\.\d{2}\s*(?:am|pm))(?:\s*\(([^)]*)\))?/gi;

  return Array.from(text.matchAll(timeWithAnnotation), ([, clockText, bracketed]): TimeEntry => {
    const time = clockText.replace(/\s/g, '');
    const annotation = (bracketed ?? '').trim();

    if (annotation === '') {
      return { time, language: 'English', notes: null };
    }
    if (!KNOWN_LANGUAGES.has(annotation.toLowerCase())) {
      return { time, language: 'English', notes: annotation };
    }
    const capitalised = annotation.charAt(0).toUpperCase() + annotation.slice(1).toLowerCase();
    return { time, language: capitalised, notes: null };
  });
}
/**
 * Convert a 12-hour "H.MMam" / "H.MMpm" time into 24-hour "HH:MM".
 *
 * 12.xxam maps to 00:xx and 12.xxpm stays 12:xx; other pm hours have 12 added.
 *
 * @param time time without internal whitespace, e.g. "7.00am" (am/pm in any case)
 * @returns the zero-padded 24-hour time, or `null` when the input does not have the expected
 *          shape, the minutes exceed 59 or the resulting hour exceeds 23
 */
function convertTo24h(time: string): string | null {
  const parts = /^(\d{1,2})\.(\d{2})(am|pm)$/i.exec(time);
  if (parts === null) return null;

  const [, hourText, minuteText, meridiem] = parts;
  const minutes = parseInt(minuteText, 10);
  if (minutes < 0 || minutes > 59) return null;

  const clockHour = parseInt(hourText, 10);
  const isMorning = meridiem.toLowerCase() === 'am';
  let hour24: number;
  if (clockHour === 12) {
    hour24 = isMorning ? 0 : 12; // midnight / noon
  } else {
    hour24 = isMorning ? clockHour : clockHour + 12;
  }
  if (hour24 < 0 || hour24 > 23) return null;

  return [hour24, minutes].map((value) => String(value).padStart(2, '0')).join(':');
}
// ─── Helpers ────────────────────────────────────────────────────────────────
/** Remove everything that looks like an HTML tag (`<…>`), keeping the text in between. */
function stripHtmlTags(html: string): string {
  const textPieces = html.split(/<[^>]+>/);
  return textPieces.join('');
}
/**
 * Decode the HTML entities that occur in scraped text.
 *
 * Handles the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;` and every numeric
 * entity in decimal (`&#233;`) or hexadecimal (`&#xE9;`) form, including code points above
 * U+FFFF. Unknown named entities and out-of-range numbers are left untouched.
 *
 * The text is decoded in a single pass, so already-escaped input is unescaped exactly once:
 * `&amp;lt;` becomes `&lt;`, not `<`.
 */
function decodeHtmlEntities(text: string): string {
  const namedEntities = new Map<string, string>([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
  ]);

  return text.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g,
    (entity: string, decimal?: string, hex?: string, name?: string): string => {
      if (name !== undefined) {
        return namedEntities.get(name) ?? entity;
      }
      const codePoint = decimal !== undefined ? parseInt(decimal, 10) : parseInt(hex ?? '', 16);
      // String.fromCodePoint throws outside the Unicode range; keep such input verbatim.
      if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) {
        return entity;
      }
      return String.fromCodePoint(codePoint);
    },
  );
}
// ─── Job Management ─────────────────────────────────────────────────────────
/**
 * Mark an existing background job as running.
 *
 * When a job id is supplied, the job row is updated (status 'running', startedAt = now) and
 * the id is returned. Without a job id nothing is written and `null` is returned, which
 * turns the other job helpers into no-ops.
 */
async function createOrResumeJob(jobId?: string): Promise<string | null> {
  if (!jobId) {
    return null;
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
/**
 * Record the final state of a background job.
 *
 * The job is marked 'failed' when a non-empty error message is given and 'completed'
 * otherwise; completedAt is set to now. Does nothing without a job id. A failure to write
 * the update is logged and swallowed so it cannot mask the import's own outcome.
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (!jobId) return;

  try {
    const finalStatus = error ? 'failed' : 'completed';
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: {
        status: finalStatus,
        error: error || null,
        completedAt: new Date(),
      },
    });
  } catch (updateFailure) {
    console.error(`Failed to update job ${jobId}:`, updateFailure);
  }
}
/**
 * Write the current progress counters to the background job row.
 *
 * `succeeded` is the number of churches matched plus created, `failed` the error count and
 * `itemsFound` the number of churches parsed so far. Does nothing without a job id; a failed
 * write is logged and otherwise ignored.
 *
 * @param total total number of items the run will process
 * @param processed number of items handled so far
 */
async function updateJobProgress(jobId: string | null, stats: ImportStats, total: number, processed: number): Promise<void> {
  if (!jobId) return;

  const { churchesMatched, churchesCreated, churchesParsed, errors } = stats;
  try {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: {
        totalItems: total,
        processed,
        succeeded: churchesMatched + churchesCreated,
        failed: errors,
        itemsFound: churchesParsed,
      },
    });
  } catch (updateFailure) {
    console.error(`Failed to update job progress:`, updateFailure);
  }
}
// ─── Database ───────────────────────────────────────────────────────────────
/**
 * Load the churches that imported records are compared against for duplicate detection.
 *
 * Selected are churches that already carry a weekdaymasses ID, plus every church in the
 * countries listed below. Only the columns in `select` are fetched.
 *
 * NOTE(review): the country list is much shorter than the set of countries this importer can
 * detect; churches from other sources in unlisted countries are not loaded — confirm that
 * this is an intended trade-off and not a source of duplicates.
 */
async function loadExistingChurches(): Promise<ExistingChurch[]> {
const churches = await prisma.church.findMany({
where: {
OR: [
// Previously imported from weekdaymasses (any country)
{ weekdayMassesId: { not: null } },
// Any church, from any source, in one of these countries
{ country: { in: ['GB', 'IE', 'IN', 'LK', 'FR', 'IT', 'VA', 'PT', 'ES', 'KR', 'JP', 'PH', 'SG', 'MY', 'HK'] } },
],
},
select: {
id: true,
name: true,
latitude: true,
longitude: true,
// Per-source identifiers
osmId: true,
baiduId: true,
masstimesId: true,
orarimesseId: true,
massSchedulesPhId: true,
philmassId: true,
horariosMisasId: true,
mszeInfoId: true,
weekdayMassesId: true,
messesInfoId: true,
bohosluzbyId: true,
miserendId: true,
kerknetId: true,
gottesdienstzeitenId: true,
discovermassId: true,
source: true,
// Contact fields; the importer only fills these in when they are empty
website: true,
phone: true,
address: true,
country: true,
},
});
return churches;
}
// ─── Main Import ────────────────────────────────────────────────────────────
/**
 * Imports the church blocks of one area page into the database.
 *
 * For each block at or beyond `resumeFrom` (measured as an absolute index
 * across all areas) the block is parsed and then either:
 *  - logged only (dry run; only while `stats.churchesParsed` is at most 20),
 *  - merged into the existing church returned by `findDuplicateChurch`
 *    (missing fields filled in, mass schedules replaced when new ones exist), or
 *  - created as a new church together with its mass schedules.
 *
 * Failures while persisting a single church are logged and counted in
 * `stats.errors`; they do not abort the import.
 *
 * @param areaName Area key, used in log output only.
 * @param config Area configuration; `defaultCountry` is passed to the parser.
 * @param blocks Raw church blocks extracted from the area page.
 * @param existingChurches In-memory dedup list; newly created churches are appended.
 * @param stats Running counters, mutated in place.
 * @param dryRun When true, nothing is written and no job progress is reported.
 * @param resumeFrom Blocks whose absolute index is below this value are skipped.
 * @param jobId Job identifier forwarded to `updateJobProgress`.
 * @param globalProcessed Number of blocks handled by previous areas.
 * @param globalTotal Total number of blocks across all areas (for progress output).
 * @returns `globalProcessed + blocks.length`, i.e. the offset for the next area.
 */
async function importAreaBlocks(
  areaName: string,
  config: { url: string; defaultCountry: string },
  blocks: string[],
  existingChurches: ExistingChurch[],
  stats: ImportStats,
  dryRun: boolean,
  resumeFrom: number,
  jobId: string | null,
  globalProcessed: number,
  globalTotal: number,
): Promise<number> {
  console.log(`\nProcessing ${areaName}: ${blocks.length} churches`);
  const startTime = Date.now();

  // Recognizes a unique-constraint violation by Prisma's P2002 error code,
  // falling back to the message text the original check relied on.
  const isUniqueConstraintError = (error: unknown): boolean => {
    if (typeof error !== 'object' || error === null) return false;
    if ((error as { code?: unknown }).code === 'P2002') return true;
    return error instanceof Error && error.message.includes('Unique constraint');
  };

  // Logs and persists job progress at every 500th absolute position.
  const reportProgress = async (totalProcessed: number): Promise<void> => {
    if (totalProcessed % 500 !== 0) return;
    const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
    console.log(` Progress: ${totalProcessed}/${globalTotal} [${elapsed}s]`);
    await updateJobProgress(jobId, stats, globalTotal, totalProcessed);
  };

  for (let i = 0; i < blocks.length; i++) {
    const absoluteIndex = globalProcessed + i;
    if (absoluteIndex < resumeFrom) continue;

    // The `finally` guarantees the progress checkpoint runs for every block we
    // actually looked at, including those that leave early via `continue`
    // (unparseable blocks, unique-constraint skips).
    try {
      const church = parseChurchBlock(blocks[i], config.defaultCountry);
      if (!church) {
        stats.errors++;
        continue;
      }
      stats.churchesParsed++;

      if (dryRun) {
        // Only the first 20 parsed churches are echoed to keep output short.
        if (stats.churchesParsed <= 20) {
          const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
          console.log(` [${areaName}] ${church.name} (${church.country}) — ${church.schedules.length} schedules, coords: ${church.latitude.toFixed(4)}, ${church.longitude.toFixed(4)} [${elapsed}s]`);
        }
        continue;
      }

      try {
        const candidate = {
          name: church.name,
          lat: church.latitude,
          lng: church.longitude,
          weekdayMassesId: church.churchId,
        };
        const duplicate = findDuplicateChurch(candidate, existingChurches);

        if (duplicate) {
          // Update existing church
          const updateData: Record<string, unknown> = {
            weekdayMassesId: church.churchId,
            lastScrapedAt: new Date(),
          };
          // Only fill in missing fields
          if (!duplicate.phone && church.phone) updateData.phone = church.phone;
          if (!duplicate.website && church.website) {
            updateData.website = church.website;
            updateData.hasWebsite = true;
          }
          if (!duplicate.address && church.address) updateData.address = church.address;
          // Update country if existing is unknown (XX) and we detected a real one
          if (duplicate.country === 'XX' && church.country !== 'XX') {
            updateData.country = church.country;
          }

          try {
            await prisma.church.update({
              where: { id: duplicate.id },
              data: updateData,
            });
          } catch (error) {
            if (isUniqueConstraintError(error)) {
              stats.churchesSkipped++;
              continue;
            }
            throw error;
          }

          // Replace mass schedules if we have new ones
          if (church.schedules.length > 0) {
            await prisma.$transaction(async (tx) => {
              await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
              await tx.massSchedule.createMany({
                data: church.schedules.map((s) => ({
                  churchId: duplicate.id,
                  dayOfWeek: s.dayOfWeek,
                  time: s.time,
                  language: s.language,
                  notes: s.notes,
                })),
              });
            });
            stats.massSchedulesCreated += church.schedules.length;
          }
          stats.churchesMatched++;
        } else {
          // Create new church
          try {
            // Church and schedules are written in one transaction so a failed
            // schedule insert cannot leave behind a church row that is missing
            // from the in-memory dedup list.
            const newChurch = await prisma.$transaction(async (tx) => {
              const created = await tx.church.create({
                data: {
                  name: church.name,
                  latitude: church.latitude,
                  longitude: church.longitude,
                  address: church.address,
                  country: church.country,
                  phone: church.phone,
                  website: church.website,
                  hasWebsite: !!church.website,
                  weekdayMassesId: church.churchId,
                  source: 'weekdaymasses',
                  lastScrapedAt: church.schedules.length > 0 ? new Date() : null,
                },
              });
              if (church.schedules.length > 0) {
                await tx.massSchedule.createMany({
                  data: church.schedules.map((s) => ({
                    churchId: created.id,
                    dayOfWeek: s.dayOfWeek,
                    time: s.time,
                    language: s.language,
                    notes: s.notes,
                  })),
                });
              }
              return created;
            });
            stats.massSchedulesCreated += church.schedules.length;

            // Add to in-memory dedup list
            existingChurches.push({
              id: newChurch.id,
              name: church.name,
              latitude: church.latitude,
              longitude: church.longitude,
              osmId: null,
              baiduId: null,
              masstimesId: null,
              orarimesseId: null,
              massSchedulesPhId: null,
              philmassId: null,
              horariosMisasId: null,
              mszeInfoId: null,
              weekdayMassesId: church.churchId,
              messesInfoId: null,
              bohosluzbyId: null,
              miserendId: null,
              kerknetId: null,
              gottesdienstzeitenId: null,
              discovermassId: null,
              source: 'weekdaymasses',
              website: church.website,
              phone: church.phone,
              address: church.address,
              country: church.country,
            });
            stats.churchesCreated++;
          } catch (error) {
            if (isUniqueConstraintError(error)) {
              stats.churchesSkipped++;
              continue;
            }
            throw error;
          }
        }
      } catch (error) {
        console.error(` Error processing ${church.name} (${church.churchId}): ${error instanceof Error ? error.message : error}`);
        stats.errors++;
      }
    } finally {
      // Dry runs never reported job progress; keep it that way.
      if (!dryRun) {
        await reportProgress(absoluteIndex + 1);
      }
    }
  }

  return globalProcessed + blocks.length;
}
// ─── Main ───────────────────────────────────────────────────────────────────
/**
 * Script entry point: parses CLI arguments, registers the job, fetches the
 * selected area pages, imports their church blocks and prints a summary.
 *
 * Everything after job creation runs inside one try/catch/finally so that any
 * failure marks the job as failed via `completeJob` and the Prisma client is
 * always disconnected. Failures set `process.exitCode` instead of calling
 * `process.exit()`, because an immediate exit would skip the `finally` block.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  const jobId = await createOrResumeJob(args.jobId);

  try {
    console.log(`\n${'='.repeat(70)}`);
    console.log('WEEKDAYMASSES.ORG.UK IMPORTER');
    console.log('='.repeat(70));
    console.log(`Mode: ${args.all ? 'All areas' : `Area: ${args.area}`}`);
    console.log(`Dry run: ${args.dryRun ? 'YES' : 'NO'}`);
    if (args.resumeFrom > 0) console.log(`Resume from: ${args.resumeFrom}`);
    console.log(`Time: ${new Date().toISOString()}`);
    console.log('='.repeat(70));

    const stats: ImportStats = {
      churchesParsed: 0,
      churchesMatched: 0,
      churchesCreated: 0,
      churchesSkipped: 0,
      massSchedulesCreated: 0,
      errors: 0,
    };

    // Determine which areas to import
    const areas: Array<[string, { url: string; defaultCountry: string }]> = [];
    if (args.all) {
      areas.push(...Object.entries(AREA_PAGES));
    } else if (args.area) {
      const config = AREA_PAGES[args.area];
      if (!config) {
        const message = `Unknown area: ${args.area}. Valid: ${Object.keys(AREA_PAGES).join(', ')}`;
        console.error(message);
        // The job record already exists at this point, so close it as failed
        // rather than leaving it dangling.
        await completeJob(jobId, message);
        process.exitCode = 1;
        return;
      }
      areas.push([args.area, config]);
    }

    // Load existing churches for deduplication
    if (!args.dryRun) {
      console.log('\nLoading existing churches for deduplication...');
    }
    const existingChurches = args.dryRun ? [] : await loadExistingChurches();
    if (!args.dryRun) {
      console.log(`Loaded ${existingChurches.length} existing churches`);
    }

    // Pre-fetch all area pages to get accurate total count for progress tracking
    console.log('\nFetching area pages...');
    const fetchedAreas: Array<{ name: string; config: { url: string; defaultCountry: string }; blocks: string[] }> = [];
    let globalTotal = 0;
    for (const [areaName, config] of areas) {
      console.log(` Fetching ${areaName}: ${SITE_BASE}${config.url}`);
      const html = await fetchPage(`${SITE_BASE}${config.url}`);
      if (!html) {
        // A page that cannot be fetched is reported and left out of the run.
        console.error(` Failed to fetch ${areaName} page`);
        continue;
      }
      console.log(` Page size: ${(html.length / 1024 / 1024).toFixed(1)} MB`);
      const blocks = extractChurchBlocks(html);
      console.log(` Found ${blocks.length} church blocks`);
      globalTotal += blocks.length;
      fetchedAreas.push({ name: areaName, config, blocks });
    }
    console.log(`\nTotal churches across all areas: ${globalTotal}`);

    // Each call returns the running offset that the next area starts from.
    let globalProcessed = 0;
    for (const { name: areaName, config, blocks } of fetchedAreas) {
      globalProcessed = await importAreaBlocks(
        areaName, config, blocks, existingChurches, stats,
        args.dryRun, args.resumeFrom, jobId,
        globalProcessed, globalTotal,
      );
    }

    // Print summary
    console.log(`\n${'='.repeat(70)}`);
    console.log(`WEEKDAYMASSES IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
    console.log('='.repeat(70));
    console.log(`Churches parsed: ${stats.churchesParsed}`);
    if (!args.dryRun) {
      console.log(`Churches matched: ${stats.churchesMatched}`);
      console.log(`Churches created: ${stats.churchesCreated}`);
      console.log(`Churches skipped: ${stats.churchesSkipped} (duplicates)`);
      console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
    }
    if (stats.errors > 0) {
      console.log(`Errors: ${stats.errors}`);
    }
    console.log('='.repeat(70));

    await completeJob(jobId);
  } catch (error) {
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
    // Set the exit code rather than exiting here so `finally` still runs.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
  }
}
// Run the importer. Anything that escapes main()'s own error handling is
// logged here and turned into a non-zero exit status instead of being left
// as a floating, unhandled promise rejection.
main().catch((error: unknown) => {
  console.error('Unhandled error in importer:', error);
  process.exit(1);
});