#!/usr/bin/env tsx /** * Import Catholic churches and mass schedules from weekdaymasses.org.uk * * weekdaymasses.org.uk covers ~4,000+ churches globally (GB, Ireland, and 49+ * international countries). All data is served on single HTML pages per area. * * Import strategy: * 1. Fetch area pages (gb, ireland, outside-gb) * 2. Parse `.church` divs for name, coordinates, address, phone, website, mass times * 3. Convert mass times from H.MMam/pm to HH:MM 24h format * 4. Detect country from address patterns (for outside-gb) * 5. Match against existing churches, upsert with mass schedules * * Usage: * npx tsx scripts/import-weekdaymasses.ts --all * npx tsx scripts/import-weekdaymasses.ts --area gb * npx tsx scripts/import-weekdaymasses.ts --area outside-gb --dry-run * npx tsx scripts/import-weekdaymasses.ts --all --resume-from 500 * npx tsx scripts/import-weekdaymasses.ts --all --job-id {uuid} */ import dotenv from 'dotenv'; import path from 'path'; dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); dotenv.config({ path: path.resolve(process.cwd(), '.env') }); import { Pool } from 'pg'; import { PrismaPg } from '@prisma/adapter-pg'; import { PrismaClient } from '@prisma/client'; const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); const pool = new Pool({ connectionString: dbUrl, ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined, }); const adapter = new PrismaPg(pool); const prisma = new PrismaClient({ adapter }); import { findDuplicateChurch } from '../src/lib/church-matcher'; import type { ExistingChurch } from '../src/lib/church-matcher'; // ─── Constants ─────────────────────────────────────────────────────────────── const SITE_BASE = 'https://weekdaymasses.org.uk'; const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; const AREA_PAGES: Record = { 'gb': { url: '/en/area/gb/churches', defaultCountry: 'GB' }, 'ireland': { url: '/en/area/ireland/churches', defaultCountry: 'IE' }, 'outside-gb': { url: '/en/area/outside-gb/churches', defaultCountry: '' }, // needs detection }; // Known languages that may appear in parentheses after mass times const KNOWN_LANGUAGES = new Set([ 'english', 'tamil', 'sinhala', 'sinhalese', 'french', 'spanish', 'portuguese', 'polish', 'italian', 'german', 'latin', 'korean', 'japanese', 'chinese', 'mandarin', 'cantonese', 'tagalog', 'filipino', 'hindi', 'malayalam', 'konkani', 'telugu', 'kannada', 'marathi', 'bengali', 'urdu', 'arabic', 'vietnamese', 'indonesian', 'malay', 'dutch', 'hungarian', 'czech', 'slovak', 'slovenian', 'croatian', 'swahili', 'igbo', 'yoruba', 'ga', 'twi', 'ewe', 'shona', 'zulu', 'sesotho', 'afrikaans', ]); // Country name patterns (matched anywhere in address, no $ anchor — addresses have trailing \r\n) const COUNTRY_NAME_MAP: Record = { 'india': 'IN', 'sri lanka': 'LK', 'france': 'FR', 'italy': 'IT', 'spain': 'ES', 'portugal': 'PT', 'germany': 'DE', 'south korea': 'KR', 'korea': 'KR', 'japan': 'JP', 'philippines': 'PH', 'singapore': 'SG', 'malaysia': 'MY', 'hong kong': 'HK', 'thailand': 'TH', 'indonesia': 'ID', 'vietnam': 'VN', 'pakistan': 'PK', 'bangladesh': 'BD', 'nepal': 'NP', 'myanmar': 'MM', 'nigeria': 'NG', 'ghana': 'GH', 'kenya': 'KE', 'tanzania': 'TZ', 'uganda': 'UG', 'south africa': 'ZA', 'australia': 'AU', 'new zealand': 'NZ', 'canada': 'CA', 'belgium': 'BE', 'netherlands': 'NL', 'luxembourg': 'LU', 'switzerland': 'CH', 'austria': 'AT', 'poland': 'PL', 'hungary': 'HU', 'czech republic': 'CZ', 'czechia': 'CZ', 'mexico': 'MX', 'brazil': 'BR', 'argentina': 'AR', 'colombia': 'CO', 'peru': 'PE', 'chile': 'CL', 'china': 'CN', 'taiwan': 'TW', 'ireland': 'IE', 'malta': 'MT', 'cyprus': 'CY', 'croatia': 'HR', 'slovenia': 'SI', 'romania': 'RO', 'slovakia': 'SK', 'senegal': 'SN', 'grenada': 'GD', 'greece': 'GR', 'denmark': 'DK', 'sweden': 'SE', 'norway': 'NO', 'finland': 'FI', 'iceland': 'IS', 'latvia': 'LV', 'lithuania': 'LT', 'estonia': 'EE', 'ukraine': 'UA', 'russia': 'RU', 'georgia': 'GE', 'armenia': 'AM', 'jordan': 'JO', 'lebanon': 'LB', 'israel': 'IL', 'turkey': 'TR', 'egypt': 'EG', 'morocco': 'MA', 'tunisia': 'TN', 'cameroon': 'CM', 'ethiopia': 'ET', 'madagascar': 'MG', 'mozambique': 'MZ', 'zambia': 'ZM', 'zimbabwe': 'ZW', 'trinidad': 'TT', 'trinidad and tobago': 'TT', 'jamaica': 'JM', 'barbados': 'BB', 'bahamas': 'BS', 'bermuda': 'BM', 'costa rica': 'CR', 'panama': 'PA', 'guatemala': 'GT', 'honduras': 'HN', 'el salvador': 'SV', 'nicaragua': 'NI', 'ecuador': 'EC', 'venezuela': 'VE', 'bolivia': 'BO', 'paraguay': 'PY', 'uruguay': 'UY', 'puerto rico': 'PR', 'fiji': 'FJ', 'samoa': 'WS', 'tonga': 'TO', 'guam': 'GU', 'liechtenstein': 'LI', 'monaco': 'MC', 'andorra': 'AD', 'san marino': 'SM', 'serbia': 'RS', 'bosnia': 'BA', 'montenegro': 'ME', 'north macedonia': 'MK', 'albania': 'AL', 'kosovo': 'XK', 'bulgaria': 'BG', 'moldova': 'MD', 'belarus': 'BY', 'kazakhstan': 'KZ', 'uzbekistan': 'UZ', 'kyrgyzstan': 'KG', 'tajikistan': 'TJ', 'cambodia': 'KH', 'laos': 'LA', 'brunei': 'BN', 'east timor': 'TL', 'timor-leste': 'TL', 'papua new guinea': 'PG', 'mongolia': 'MN', 'curaçao': 'CW', 'curacao': 'CW', 'cape verde': 'CV', 'cabo verde': 'CV', 'the gambia': 'GM', 'gambia': 'GM', 'congo': 'CD', 'ivory coast': 'CI', "côte d'ivoire": 'CI', 'burkina faso': 'BF', 'suriname': 'SR', 'guyana': 'GY', 'belize': 'BZ', 'haiti': 'HT', 'dominican republic': 'DO', 'cuba': 'CU', 'qatar': 'QA', 'united arab emirates': 'AE', 'u.a.e.': 'AE', 'uae': 'AE', 'dubai': 'AE', 'abu dhabi': 'AE', 'saudi arabia': 'SA', 'bahrain': 'BH', 'kuwait': 'KW', 'oman': 'OM', 'antigua and barbuda': 'AG', 'antigua': 'AG', 'mauritius': 'MU', 'réunion': 'RE', 'reunion': 'RE', 'seychelles': 'SC', 'saint lucia': 'LC', 'st. lucia': 'LC', 'dominica': 'DM', 'saint vincent': 'VC', 'st. vincent': 'VC', 'saint kitts': 'KN', 'st. kitts': 'KN', 'u.s. virgin islands': 'VI', 'us virgin islands': 'VI', 'saint croix': 'VI', 'saint thomas': 'VI', 'virgin islands': 'VI', 'aruba': 'AW', 'bonaire': 'BQ', 'sint maarten': 'SX', 'iraq': 'IQ', 'iran': 'IR', 'afghanistan': 'AF', 'macao': 'MO', 'macau': 'MO', }; // City/region-based detection fallback (for addresses without country names) const CITY_COUNTRY_MAP: Record = { // Major cities that unambiguously identify a country 'jakarta': 'ID', 'surabaya': 'ID', 'bandung': 'ID', 'yogyakarta': 'ID', 'budapest': 'HU', 'berlin': 'DE', 'münchen': 'DE', 'munich': 'DE', 'hamburg': 'DE', 'köln': 'DE', 'frankfurt': 'DE', 'düsseldorf': 'DE', 'stuttgart': 'DE', 'paris': 'FR', 'lyon': 'FR', 'marseille': 'FR', 'toulouse': 'FR', 'lille': 'FR', 'nantes': 'FR', 'bordeaux': 'FR', 'strasbourg': 'FR', 'rennes': 'FR', 'roma': 'IT', 'rome': 'IT', 'milano': 'IT', 'milan': 'IT', 'napoli': 'IT', 'torino': 'IT', 'firenze': 'IT', 'florence': 'IT', 'bologna': 'IT', 'genova': 'IT', 'madrid': 'ES', 'barcelona': 'ES', 'valencia': 'ES', 'sevilla': 'ES', 'seville': 'ES', 'málaga': 'ES', 'bilbao': 'ES', 'mallorca': 'ES', 'tenerife': 'ES', 'lisboa': 'PT', 'lisbon': 'PT', 'porto': 'PT', 'faro': 'PT', 'warszawa': 'PL', 'warsaw': 'PL', 'kraków': 'PL', 'krakow': 'PL', 'praha': 'CZ', 'prague': 'CZ', 'brno': 'CZ', 'wien': 'AT', 'vienna': 'AT', 'innsbruck': 'AT', 'salzburg': 'AT', 'graz': 'AT', 'zürich': 'CH', 'zurich': 'CH', 'genève': 'CH', 'geneva': 'CH', 'bern': 'CH', 'basel': 'CH', 'amsterdam': 'NL', 'rotterdam': 'NL', 'den haag': 'NL', 'bruxelles': 'BE', 'brussels': 'BE', 'brugge': 'BE', 'antwerpen': 'BE', 'københavn': 'DK', 'copenhagen': 'DK', 'aarhus': 'DK', 'aalborg': 'DK', 'stockholm': 'SE', 'göteborg': 'SE', 'malmö': 'SE', 'oslo': 'NO', 'bergen': 'NO', 'helsinki': 'FI', 'reykjavik': 'IS', 'riga': 'LV', 'vilnius': 'LT', 'tallinn': 'EE', 'kyiv': 'UA', 'київ': 'UA', 'lviv': 'UA', 'москва': 'RU', 'moscow': 'RU', 'санкт-петербург': 'RU', 'магадан': 'RU', 'калуга': 'RU', 'новосибирск': 'RU', 'владивосток': 'RU', 'tbilisi': 'GE', 'yerevan': 'AM', 'amman': 'JO', 'beirut': 'LB', 'istanbul': 'TR', 'ankara': 'TR', 'cairo': 'EG', 'casablanca': 'MA', 'tunis': 'TN', 'nairobi': 'KE', 'dar es salaam': 'TZ', 'kampala': 'UG', 'lagos': 'NG', 'accra': 'GH', 'johannesburg': 'ZA', 'cape town': 'ZA', 'durban': 'ZA', 'pretoria': 'ZA', 'seoul': 'KR', 'busan': 'KR', 'tokyo': 'JP', 'osaka': 'JP', 'yokohama': 'JP', 'nagasaki': 'JP', 'kyoto': 'JP', 'beijing': 'CN', 'shanghai': 'CN', 'taipei': 'TW', 'mumbai': 'IN', 'chennai': 'IN', 'kolkata': 'IN', 'delhi': 'IN', 'new delhi': 'IN', 'bangalore': 'IN', 'bengaluru': 'IN', 'hyderabad': 'IN', 'goa': 'IN', 'colombo': 'LK', 'matara': 'LK', 'kandy': 'LK', 'galle': 'LK', 'kuala lumpur': 'MY', 'penang': 'MY', 'manila': 'PH', 'cebu': 'PH', 'bangkok': 'TH', 'chiang mai': 'TH', 'hà nội': 'VN', 'hanoi': 'VN', 'ho chi minh': 'VN', 'saigon': 'VN', 'phnom penh': 'KH', 'vientiane': 'LA', 'sydney': 'AU', 'melbourne': 'AU', 'brisbane': 'AU', 'perth': 'AU', 'adelaide': 'AU', 'auckland': 'NZ', 'wellington': 'NZ', 'christchurch': 'NZ', 'toronto': 'CA', 'vancouver': 'CA', 'montreal': 'CA', 'ottawa': 'CA', 'mexico city': 'MX', 'guadalajara': 'MX', 'monterrey': 'MX', 'são paulo': 'BR', 'rio de janeiro': 'BR', 'brasília': 'BR', 'buenos aires': 'AR', 'bogotá': 'CO', 'lima': 'PE', 'santiago': 'CL', 'vaduz': 'LI', 'monaco': 'MC', 'valletta': 'MT', 'nicosia': 'CY', 'zagreb': 'HR', 'ljubljana': 'SI', 'bratislava': 'SK', 'bucharest': 'RO', 'sofia': 'BG', 'belgrade': 'RS', 'nadi': 'FJ', 'suva': 'FJ', 'san juan': 'PR', 'viejo san juan': 'PR', // Cities missed in the first pass 'calais': 'FR', 'lourdes': 'FR', 'nice': 'FR', 'montpellier': 'FR', 'toulon': 'FR', 'abidjan': 'CI', 'douala': 'CM', 'yaoundé': 'CM', 'kinshasa': 'CD', 'lusaka': 'ZM', 'harare': 'ZW', 'maputo': 'MZ', 'antananarivo': 'MG', 'dakar': 'SN', 'pademangan': 'ID', 'jakarta utara': 'ID', 'denpasar': 'ID', 'semarang': 'ID', 'makassar': 'ID', 'medan': 'ID', 'bogor': 'ID', 'malang': 'ID', 'palembang': 'ID', '서울': 'KR', '부산': 'KR', // Seoul, Busan in Korean // Japanese city names in kanji '東京': 'JP', '大阪': 'JP', '横浜': 'JP', '名古屋': 'JP', '長崎': 'JP', '京都': 'JP', '神戸': 'JP', '福岡': 'JP', '札幌': 'JP', '仙台': 'JP', '広島': 'JP', // Chinese city names in hanzi '北京': 'CN', '上海': 'CN', '深圳': 'CN', '广州': 'CN', '香港': 'HK', // More missing cities 'kuching': 'MY', 'kota kinabalu': 'MY', 'ipoh': 'MY', 'johor bahru': 'MY', 'sarawak': 'MY', 'trondheim': 'NO', 'stavanger': 'NO', 'tromsø': 'NO', 'taastrup': 'DK', 'odense': 'DK', 'cancún': 'MX', 'playa del carmen': 'MX', 'mérida': 'MX', 'puebla': 'MX', 'cancun': 'MX', 'addis ababa': 'ET', 'la paz': 'BO', 'cochabamba': 'BO', 'santa cruz': 'BO', 'willemstad': 'CW', 'curaçao': 'CW', 'curacao': 'CW', 'port of spain': 'TT', 'bridgetown': 'BB', 'nassau': 'BS', 'phnom penh': 'KH', 'siem reap': 'KH', 'port moresby': 'PG', 'ulaanbaatar': 'MN', 'praia': 'CV', 'cape verde': 'CV', 'celebration': 'US', // Celebration, Florida — city not great, but helps 'the gambia': 'GM', 'gambia': 'GM', 'banjul': 'GM', 'playa blanca': 'ES', 'gran canaria': 'ES', 'fuerteventura': 'ES', 'lanzarote': 'ES', 'tirana': 'AL', 'durrës': 'AL', 'podgorica': 'ME', 'budva': 'ME', 'skopje': 'MK', 'pristina': 'XK', 'sarajevo': 'BA', 'minsk': 'BY', 'chișinău': 'MD', 'chisinau': 'MD', 'bishkek': 'KG', 'dushanbe': 'TJ', 'tashkent': 'UZ', 'almaty': 'KZ', 'astana': 'KZ', 'lekki': 'NG', 'abuja': 'NG', 'enugu': 'NG', 'yaba': 'NG', 'ikeja': 'NG', // Serbian 'beograd': 'RS', 'novi sad': 'RS', // Thai 'phuket': 'TH', 'pattaya': 'TH', 'hua hin': 'TH', // Spanish cities 'alicante': 'ES', 'zaragoza': 'ES', 'murcia': 'ES', 'palma': 'ES', 'granada': 'ES', 'córdoba': 'ES', 'santander': 'ES', 'cádiz': 'ES', 'san sebastián': 'ES', 'las palmas': 'ES', 'santa cruz de tenerife': 'ES', // Belgian 'woluwe': 'BE', 'ixelles': 'BE', 'schaerbeek': 'BE', 'liège': 'BE', 'namur': 'BE', // Portuguese 'loulé': 'PT', 'albufeira': 'PT', 'coimbra': 'PT', 'braga': 'PT', 'funchal': 'PT', // Turkish 'mersin': 'TR', 'izmir': 'TR', 'antalya': 'TR', 'trabzon': 'TR', // Lebanese (French spelling) 'beyrouth': 'LB', // Burkina Faso 'ouagadougou': 'BF', 'bobo-dioulasso': 'BF', // Greek 'heraklion': 'GR', 'ηράκλειο': 'GR', 'μυτιλήνη': 'GR', 'αθήνα': 'GR', 'athens': 'GR', 'thessaloniki': 'GR', 'patras': 'GR', // Bulgarian (transliterated) 'plovdiv': 'BG', 'пловдив': 'BG', 'варна': 'BG', // Vietnamese with diacritics 'sài gòn': 'VN', 'hồ chí minh': 'VN', 'đà nẵng': 'VN', // Moldovan 'chişinău': 'MD', // Hungarian 'ferenciek': 'HU', 'debrecen': 'HU', 'szeged': 'HU', 'pécs': 'HU', // Polish cities 'kalisz': 'PL', 'gdańsk': 'PL', 'wrocław': 'PL', 'poznań': 'PL', 'łódź': 'PL', 'katowice': 'PL', 'lublin': 'PL', 'szczecin': 'PL', // Bermuda 'warwick': 'BM', // Maltese 'sliema': 'MT', 'valletta': 'MT', }; // Postal code / state code patterns const POSTAL_PATTERNS: Array<{ pattern: RegExp; country: string }> = [ { pattern: /\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b/, country: 'GB' }, // UK postcode { pattern: /\b[A-Z]\d{2}\s*[A-Z0-9]{4}\b/, country: 'IE' }, // Irish Eircode { pattern: /\bCittà del Vaticano\b/i, country: 'VA' }, { pattern: /\b\d{3}\s*\d{3}\b/, country: 'IN' }, // Indian 6-digit with optional space { pattern: /\bNSW\s+\d{4}\b/, country: 'AU' }, // Australian state codes { pattern: /\bVIC\s+\d{4}\b/, country: 'AU' }, { pattern: /\bQLD\s+\d{4}\b/, country: 'AU' }, { pattern: /\bSA\s+\d{4}\b/, country: 'AU' }, { pattern: /\bWA\s+\d{4}\b/, country: 'AU' }, { pattern: /\bTAS\s+\d{4}\b/, country: 'AU' }, { pattern: /\bACT\s+\d{4}\b/, country: 'AU' }, { pattern: /\bNT\s+\d{4}\b/, country: 'AU' }, { pattern: /\bA-\d{4}\b/, country: 'AT' }, // Austrian postal prefix { pattern: /\b, PR,?\s*\d{5}\b/, country: 'PR' }, // Puerto Rico { pattern: /\b\d{2}-\d{3}\b/, country: 'PL' }, // Polish postal code (XX-XXX) // US state abbreviation + ZIP (e.g., "NY, 11201" or "NY 11201") { pattern: /\b(AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY)[,\s]+\d{5}\b/, country: 'US' }, ]; // ─── Types ─────────────────────────────────────────────────────────────────── interface ParsedChurch { churchId: string; // weekdaymasses numeric ID name: string; latitude: number; longitude: number; address: string | null; phone: string | null; website: string | null; country: string; schedules: ParsedSchedule[]; } interface ParsedSchedule { dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat time: string; // "07:00", "18:30" language: string; notes: string | null; } interface ImportStats { churchesParsed: number; churchesMatched: number; churchesCreated: number; churchesSkipped: number; massSchedulesCreated: number; errors: number; } interface CLIArgs { all: boolean; area?: string; dryRun: boolean; resumeFrom: number; jobId?: string; } // ─── CLI ───────────────────────────────────────────────────────────────────── function parseArgs(): CLIArgs { const args = process.argv.slice(2); const result: CLIArgs = { all: false, dryRun: false, resumeFrom: 0 }; for (let i = 0; i < args.length; i++) { switch (args[i]) { case '--all': result.all = true; break; case '--area': result.area = args[++i]; break; case '--dry-run': result.dryRun = true; break; case '--resume-from': result.resumeFrom = parseInt(args[++i], 10); break; case '--job-id': result.jobId = args[++i]; break; case '--help': console.log(`Usage: npx tsx scripts/import-weekdaymasses.ts [options] --all Import all 3 area pages (gb, ireland, outside-gb) --area Import specific area (gb, ireland, outside-gb) --dry-run No database writes --resume-from Skip first N churches --job-id Background job tracking`); process.exit(0); } } if (!result.all && !result.area) { console.error('Error: specify --all or --area '); process.exit(1); } return result; } // ─── HTTP ──────────────────────────────────────────────────────────────────── async function fetchPage(url: string): Promise { try { const response = await fetch(url, { headers: { 'User-Agent': USER_AGENT, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', }, }); if (!response.ok) { console.error(` HTTP ${response.status} for ${url}`); return null; } return await response.text(); } catch (error) { console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`); return null; } } // ─── HTML Parsing ──────────────────────────────────────────────────────────── /** * Extract all church divs from an area page HTML. * Each church is a `
...
`. */ function extractChurchBlocks(html: string): string[] { const blocks: string[] = []; const regex = //g; let match: RegExpExecArray | null; while ((match = regex.exec(html)) !== null) { const start = match.index; // Find the closing at the right nesting level let depth = 1; let pos = start + match[0].length; while (depth > 0 && pos < html.length) { const nextOpen = html.indexOf('', pos); if (nextClose === -1) break; if (nextOpen !== -1 && nextOpen < nextClose) { depth++; pos = nextOpen + 4; } else { depth--; pos = nextClose + 6; } } blocks.push(html.substring(start, pos)); } return blocks; } /** * Parse a single church block HTML into structured data. */ function parseChurchBlock(html: string, defaultCountry: string): ParsedChurch | null { // Church ID from div id="pNNNNN" const idMatch = html.match(/id="p(\d+)"/); if (!idMatch) return null; const churchId = idMatch[1]; // Name from h3 const nameMatch = html.match(/

(.*?)<\/h3>/s); if (!nameMatch) return null; const name = decodeHtmlEntities(nameMatch[1].trim()); // Coordinates from map link let latitude = 0; let longitude = 0; const mapMatch = html.match(/lat=(-?[\d.]+)&(?:amp;)?lon=(-?[\d.]+)/); if (mapMatch) { latitude = parseFloat(mapMatch[1]); longitude = parseFloat(mapMatch[2]); } // Address from p.address — text after the
tag let address: string | null = null; const addressMatch = html.match(/([\s\S]*?)<\/p>/); if (addressMatch) { const addressHtml = addressMatch[1]; // Get text after last
(or after Streetview link) const brIdx = addressHtml.lastIndexOf('/, '').trim(); address = stripHtmlTags(textAfterTag).trim() || null; } } // Phone from p.telephone let phone: string | null = null; const phoneMatch = html.match(/[\s\S]*?Tel:<\/span>\s*(.*?)<\/p>/); if (phoneMatch) { phone = phoneMatch[1].trim() || null; } // Website from p.transport with "Link to church website:" let website: string | null = null; const websiteMatch = html.match(/Link to church website:<\/span>\s*]+href="([^"]+)"/); if (websiteMatch) { website = websiteMatch[1]; } // Country detection let country = defaultCountry; if (!country && address) { country = detectCountry(address); } if (!country) country = 'XX'; // Unknown // Mass schedules from p.times const schedules = parseScheduleBlocks(html); return { churchId, name, latitude, longitude, address, phone, website, country, schedules }; } /** * Detect country from address text using three strategies: * 1. Country name anywhere in address * 2. City/region name matching * 3. Postal code / state code patterns */ function detectCountry(address: string): string { // Clean address for matching const cleaned = address.replace(/\r?\n/g, ' ').trim(); const lower = cleaned.toLowerCase(); // 1. Country name match (check longer names first to avoid "India" matching in "Indiana") const sortedCountries = Object.entries(COUNTRY_NAME_MAP) .sort((a, b) => b[0].length - a[0].length); for (const [name, code] of sortedCountries) { if (lower.includes(name)) return code; } // 2. City/region match for (const [city, code] of Object.entries(CITY_COUNTRY_MAP)) { // Use word boundary-like check to avoid partial matches const idx = lower.indexOf(city); if (idx !== -1) { const before = idx > 0 ? lower[idx - 1] : ' '; const after = idx + city.length < lower.length ? lower[idx + city.length] : ' '; if (/[\s,.(]/.test(before) || idx === 0) { if (/[\s,.):\r\n]/.test(after) || idx + city.length === lower.length) { return code; } } } } // 3. US state detection — match "State, ZIPCODE" or "State ZIPCODE" patterns // to avoid false positives from country names like "Georgia" const US_STATES = [ 'alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut', 'delaware', 'florida', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota', 'mississippi', 'missouri', 'montana', 'nebraska', 'nevada', 'new hampshire', 'new jersey', 'new mexico', 'new york', 'north carolina', 'north dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode island', 'south carolina', 'south dakota', 'tennessee', 'texas', 'utah', 'vermont', 'virginia', 'washington', 'west virginia', 'wisconsin', 'wyoming', 'georgia', // Georgia last — also a country, but with ZIP it's US ]; for (const state of US_STATES) { const idx = lower.indexOf(state); if (idx !== -1) { // Check if followed by comma+ZIP or just ZIP (US address pattern) const after = cleaned.substring(idx + state.length); if (/^[,\s]+\d{5}/.test(after)) return 'US'; // Also match state abbreviation patterns like "FL 34747" — 2-letter code + ZIP if (state !== 'georgia' && /^[,\s]*$/.test(after)) return 'US'; } } // 4. Postal code patterns for (const { pattern, country } of POSTAL_PATTERNS) { if (pattern.test(cleaned)) return country; } return ''; } /** * Parse all

blocks into schedule entries. */ function parseScheduleBlocks(html: string): ParsedSchedule[] { const schedules: ParsedSchedule[] = []; const timesRegex = /([\s\S]*?)<\/p>/g; let match: RegExpExecArray | null; while ((match = timesRegex.exec(html)) !== null) { const text = stripHtmlTags(match[1]) .replace(/\s+/g, ' ') .trim(); const parsed = parseTimesLine(text); schedules.push(...parsed); } return schedules; } /** * Parse a single mass times line like: * "Sunday: 6.30am(Tamil), 8.30am(Tamil), 5.30pm(English)" * "Weekday: Monday, Tuesday, Wednesday 6.15am" * "Mon Tue Wed Thu Fri: 6.30am(Tamil)" */ function parseTimesLine(text: string): ParsedSchedule[] { const schedules: ParsedSchedule[] = []; // Split on colon — left side is days, right side is times const colonIdx = text.indexOf(':'); if (colonIdx === -1) return schedules; const dayPart = text.substring(0, colonIdx).trim(); const timePart = text.substring(colonIdx + 1).trim(); // Parse default day numbers from the day label const defaultDays = parseDayLabel(dayPart); // Split on semicolons to handle "Monday 10.00am; Thursday 7.30pm" patterns const parts = timePart.split(';').map(p => p.trim()).filter(Boolean); for (const part of parts) { // Check if specific day names appear at the start of this part const { specificDays, cleanedTimePart } = extractSpecificDays(part); const days = specificDays.length > 0 ? specificDays : defaultDays; if (days.length === 0) continue; const timeStr = specificDays.length > 0 ? cleanedTimePart : part; // Extract individual time entries: "7.00am(Tamil), 8.30am(English), ..." const timeEntries = extractTimeEntries(timeStr); for (const entry of timeEntries) { const time24 = convertTo24h(entry.time); if (!time24) continue; for (const day of days) { schedules.push({ dayOfWeek: day, time: time24, language: entry.language, notes: entry.notes, }); } } } return schedules; } // Day name mappings const DAY_MAP: Record = { 'sunday': 0, 'sun': 0, 'monday': 1, 'mon': 1, 'tuesday': 2, 'tue': 2, 'wednesday': 3, 'wed': 3, 'thursday': 4, 'thu': 4, 'friday': 5, 'fri': 5, 'saturday': 6, 'sat': 6, }; /** * Parse day label (left of colon) into day numbers. */ function parseDayLabel(label: string): number[] { const lower = label.toLowerCase().trim(); // "Weekday" = Mon-Fri if (lower === 'weekday' || lower === 'weekdays') { return [1, 2, 3, 4, 5]; } // "Holy Day" or "Holyday" if (lower.includes('holy day') || lower.includes('holyday')) { return []; // Skip holy days — not a regular schedule } // Try to parse individual day names from the label // e.g., "Mon Tue Wed Thu Fri" or "Monday Tuesday" const days: number[] = []; const words = lower.split(/[\s,]+/); for (const word of words) { const dayNum = DAY_MAP[word]; if (dayNum !== undefined) { days.push(dayNum); } } return days; } /** * Check if the time part starts with specific day names. * e.g., "Monday, Tuesday, Wednesday 6.15am" -> days=[1,2,3], cleaned="6.15am" */ function extractSpecificDays(timePart: string): { specificDays: number[]; cleanedTimePart: string } { const days: number[] = []; let remaining = timePart; // Match day names at the start, separated by commas/spaces const dayPattern = /^((?:(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|Mon|Tue|Wed|Thu|Fri|Sat|Sun)[,\s]*)+)/i; const match = remaining.match(dayPattern); if (match) { const dayStr = match[1]; const words = dayStr.split(/[\s,]+/).filter(Boolean); let allAreDays = true; for (const word of words) { const lower = word.toLowerCase(); if (DAY_MAP[lower] !== undefined) { days.push(DAY_MAP[lower]); } else { allAreDays = false; break; } } if (allAreDays && days.length > 0) { remaining = remaining.substring(match[0].length).trim(); } else { days.length = 0; // Reset if not all words were days } } return { specificDays: days, cleanedTimePart: remaining }; } interface TimeEntry { time: string; // Raw time: "7.00am", "6.30pm" language: string; notes: string | null; } /** * Extract time entries from a times string. * e.g., "7.00am(Tamil), 8.30am(English), 12.00pm" -> [{time: "7.00am", language: "Tamil"}, ...] */ function extractTimeEntries(text: string): TimeEntry[] { const entries: TimeEntry[] = []; // Match time patterns: digits.digitsam/pm optionally followed by (annotation) const pattern = /(\d{1,2}\.\d{2}\s*(?:am|pm))(?:\s*\(([^)]*)\))?/gi; let match: RegExpExecArray | null; while ((match = pattern.exec(text)) !== null) { const rawTime = match[1].replace(/\s/g, ''); const annotation = match[2]?.trim() || null; let language = 'English'; let notes: string | null = null; if (annotation) { if (KNOWN_LANGUAGES.has(annotation.toLowerCase())) { language = annotation.charAt(0).toUpperCase() + annotation.slice(1).toLowerCase(); } else { notes = annotation; } } entries.push({ time: rawTime, language, notes }); } return entries; } /** * Convert time from "H.MMam/pm" format to "HH:MM" 24h format. */ function convertTo24h(time: string): string | null { const match = time.match(/^(\d{1,2})\.(\d{2})(am|pm)$/i); if (!match) return null; let hours = parseInt(match[1], 10); const mins = parseInt(match[2], 10); const period = match[3].toLowerCase(); if (mins < 0 || mins > 59) return null; if (period === 'am') { if (hours === 12) hours = 0; } else { if (hours !== 12) hours += 12; } if (hours < 0 || hours > 23) return null; return `${String(hours).padStart(2, '0')}:${String(mins).padStart(2, '0')}`; } // ─── Helpers ──────────────────────────────────────────────────────────────── function stripHtmlTags(html: string): string { return html.replace(/<[^>]+>/g, ''); } function decodeHtmlEntities(text: string): string { return text .replace(/&/g, '&') .replace(/</g, '<') .replace(/>/g, '>') .replace(/"/g, '"') .replace(/'/g, "'") .replace(/'/g, "'") .replace(/'/g, "'") .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(parseInt(code, 10))); } // ─── Job Management ───────────────────────────────────────────────────────── async function createOrResumeJob(jobId?: string): Promise { if (jobId) { await prisma.backgroundJob.update({ where: { id: jobId }, data: { status: 'running', startedAt: new Date() }, }); return jobId; } return null; } async function completeJob(jobId: string | null, error?: string): Promise { if (!jobId) return; try { await prisma.backgroundJob.update({ where: { id: jobId }, data: { status: error ? 'failed' : 'completed', error: error || null, completedAt: new Date(), }, }); } catch (err) { console.error(`Failed to update job ${jobId}:`, err); } } async function updateJobProgress(jobId: string | null, stats: ImportStats, total: number, processed: number): Promise { if (!jobId) return; try { await prisma.backgroundJob.update({ where: { id: jobId }, data: { totalItems: total, processed, succeeded: stats.churchesMatched + stats.churchesCreated, failed: stats.errors, itemsFound: stats.churchesParsed, }, }); } catch (err) { console.error(`Failed to update job progress:`, err); } } // ─── Database ─────────────────────────────────────────────────────────────── async function loadExistingChurches(): Promise { const churches = await prisma.church.findMany({ where: { OR: [ { weekdayMassesId: { not: null } }, { country: { in: ['GB', 'IE', 'IN', 'LK', 'FR', 'IT', 'VA', 'PT', 'ES', 'KR', 'JP', 'PH', 'SG', 'MY', 'HK'] } }, ], }, select: { id: true, name: true, latitude: true, longitude: true, osmId: true, baiduId: true, masstimesId: true, orarimesseId: true, massSchedulesPhId: true, philmassId: true, horariosMisasId: true, mszeInfoId: true, weekdayMassesId: true, messesInfoId: true, bohosluzbyId: true, miserendId: true, kerknetId: true, gottesdienstzeitenId: true, discovermassId: true, source: true, website: true, phone: true, address: true, country: true, }, }); return churches; } // ─── Main Import ──────────────────────────────────────────────────────────── async function importAreaBlocks( areaName: string, config: { url: string; defaultCountry: string }, blocks: string[], existingChurches: ExistingChurch[], stats: ImportStats, dryRun: boolean, resumeFrom: number, jobId: string | null, globalProcessed: number, globalTotal: number, ): Promise { console.log(`\nProcessing ${areaName}: ${blocks.length} churches`); const startTime = Date.now(); for (let i = 0; i < blocks.length; i++) { const absoluteIndex = globalProcessed + i; if (absoluteIndex < resumeFrom) continue; const church = parseChurchBlock(blocks[i], config.defaultCountry); if (!church) { stats.errors++; continue; } stats.churchesParsed++; if (dryRun) { if (stats.churchesParsed <= 20) { const elapsed = ((Date.now() - startTime) / 1000).toFixed(0); console.log(` [${areaName}] ${church.name} (${church.country}) — ${church.schedules.length} schedules, coords: ${church.latitude.toFixed(4)}, ${church.longitude.toFixed(4)} [${elapsed}s]`); } continue; } try { const candidate = { name: church.name, lat: church.latitude, lng: church.longitude, weekdayMassesId: church.churchId, }; const duplicate = findDuplicateChurch(candidate, existingChurches); if (duplicate) { // Update existing church const updateData: Record = { weekdayMassesId: church.churchId, lastScrapedAt: new Date(), }; // Only fill in missing fields if (!duplicate.phone && church.phone) updateData.phone = church.phone; if (!duplicate.website && church.website) { updateData.website = church.website; updateData.hasWebsite = true; } if (!duplicate.address && church.address) updateData.address = church.address; // Update country if existing is unknown (XX) and we detected a real one if (duplicate.country === 'XX' && church.country !== 'XX') { updateData.country = church.country; } try { await prisma.church.update({ where: { id: duplicate.id }, data: updateData, }); } catch (error) { if (error instanceof Error && error.message.includes('Unique constraint')) { stats.churchesSkipped++; continue; } throw error; } // Replace mass schedules if we have new ones if (church.schedules.length > 0) { await prisma.$transaction(async (tx) => { await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } }); await tx.massSchedule.createMany({ data: church.schedules.map((s) => ({ churchId: duplicate.id, dayOfWeek: s.dayOfWeek, time: s.time, language: s.language, notes: s.notes, })), }); }); stats.massSchedulesCreated += church.schedules.length; } stats.churchesMatched++; } else { // Create new church try { const newChurch = await prisma.church.create({ data: { name: church.name, latitude: church.latitude, longitude: church.longitude, address: church.address, country: church.country, phone: church.phone, website: church.website, hasWebsite: !!church.website, weekdayMassesId: church.churchId, source: 'weekdaymasses', lastScrapedAt: church.schedules.length > 0 ? new Date() : null, }, }); // Create mass schedules if (church.schedules.length > 0) { await prisma.massSchedule.createMany({ data: church.schedules.map((s) => ({ churchId: newChurch.id, dayOfWeek: s.dayOfWeek, time: s.time, language: s.language, notes: s.notes, })), }); stats.massSchedulesCreated += church.schedules.length; } // Add to in-memory dedup list existingChurches.push({ id: newChurch.id, name: church.name, latitude: church.latitude, longitude: church.longitude, osmId: null, baiduId: null, masstimesId: null, orarimesseId: null, massSchedulesPhId: null, philmassId: null, horariosMisasId: null, mszeInfoId: null, weekdayMassesId: church.churchId, messesInfoId: null, bohosluzbyId: null, miserendId: null, kerknetId: null, gottesdienstzeitenId: null, discovermassId: null, source: 'weekdaymasses', website: church.website, phone: church.phone, address: church.address, country: church.country, }); stats.churchesCreated++; } catch (error) { if (error instanceof Error && error.message.includes('Unique constraint')) { stats.churchesSkipped++; continue; } throw error; } } } catch (error) { console.error(` Error processing ${church.name} (${church.churchId}): ${error instanceof Error ? error.message : error}`); stats.errors++; } // Progress logging const totalProcessed = absoluteIndex + 1; if (totalProcessed % 500 === 0) { const elapsed = ((Date.now() - startTime) / 1000).toFixed(0); console.log(` Progress: ${totalProcessed}/${globalTotal} [${elapsed}s]`); await updateJobProgress(jobId, stats, globalTotal, totalProcessed); } } return globalProcessed + blocks.length; } // ─── Main ─────────────────────────────────────────────────────────────────── async function main() { const args = parseArgs(); const jobId = await createOrResumeJob(args.jobId); console.log(`\n${'='.repeat(70)}`); console.log('WEEKDAYMASSES.ORG.UK IMPORTER'); console.log('='.repeat(70)); console.log(`Mode: ${args.all ? 'All areas' : `Area: ${args.area}`}`); console.log(`Dry run: ${args.dryRun ? 'YES' : 'NO'}`); if (args.resumeFrom > 0) console.log(`Resume from: ${args.resumeFrom}`); console.log(`Time: ${new Date().toISOString()}`); console.log('='.repeat(70)); const stats: ImportStats = { churchesParsed: 0, churchesMatched: 0, churchesCreated: 0, churchesSkipped: 0, massSchedulesCreated: 0, errors: 0, }; // Determine which areas to import const areas: Array<[string, { url: string; defaultCountry: string }]> = []; if (args.all) { areas.push(...Object.entries(AREA_PAGES)); } else if (args.area) { const config = AREA_PAGES[args.area]; if (!config) { console.error(`Unknown area: ${args.area}. Valid: ${Object.keys(AREA_PAGES).join(', ')}`); process.exit(1); } areas.push([args.area, config]); } // Load existing churches for deduplication if (!args.dryRun) { console.log('\nLoading existing churches for deduplication...'); } const existingChurches = args.dryRun ? [] : await loadExistingChurches(); if (!args.dryRun) { console.log(`Loaded ${existingChurches.length} existing churches`); } // Pre-fetch all area pages to get accurate total count for progress tracking console.log('\nFetching area pages...'); const fetchedAreas: Array<{ name: string; config: { url: string; defaultCountry: string }; blocks: string[] }> = []; let globalTotal = 0; for (const [areaName, config] of areas) { console.log(` Fetching ${areaName}: ${SITE_BASE}${config.url}`); const html = await fetchPage(`${SITE_BASE}${config.url}`); if (!html) { console.error(` Failed to fetch ${areaName} page`); continue; } console.log(` Page size: ${(html.length / 1024 / 1024).toFixed(1)} MB`); const blocks = extractChurchBlocks(html); console.log(` Found ${blocks.length} church blocks`); globalTotal += blocks.length; fetchedAreas.push({ name: areaName, config, blocks }); } console.log(`\nTotal churches across all areas: ${globalTotal}`); let globalProcessed = 0; try { for (const { name: areaName, config, blocks } of fetchedAreas) { globalProcessed = await importAreaBlocks( areaName, config, blocks, existingChurches, stats, args.dryRun, args.resumeFrom, jobId, globalProcessed, globalTotal, ); } // Print summary console.log(`\n${'='.repeat(70)}`); console.log(`WEEKDAYMASSES IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`); console.log('='.repeat(70)); console.log(`Churches parsed: ${stats.churchesParsed}`); if (!args.dryRun) { console.log(`Churches matched: ${stats.churchesMatched}`); console.log(`Churches created: ${stats.churchesCreated}`); console.log(`Churches skipped: ${stats.churchesSkipped} (duplicates)`); console.log(`Mass schedules created: ${stats.massSchedulesCreated}`); } if (stats.errors > 0) { console.log(`Errors: ${stats.errors}`); } console.log('='.repeat(70)); await completeJob(jobId); } catch (error) { console.error('Fatal error:', error); await completeJob(jobId, String(error)); process.exit(1); } finally { await prisma.$disconnect(); } } main();