Add discovermassId field to ExistingChurch interface and ChurchCandidate type, insert a dedicated matching pass in findDuplicateChurch, and update all 15 importer push blocks plus 16 loadExistingChurches select queries to include the new field. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1122 lines
42 KiB
TypeScript
1122 lines
42 KiB
TypeScript
#!/usr/bin/env tsx
|
|
/**
|
|
* Import Catholic churches and mass schedules from weekdaymasses.org.uk
|
|
*
|
|
* weekdaymasses.org.uk covers ~4,000+ churches globally (GB, Ireland, and 49+
|
|
* international countries). All data is served on single HTML pages per area.
|
|
*
|
|
* Import strategy:
|
|
* 1. Fetch area pages (gb, ireland, outside-gb)
|
|
* 2. Parse `.church` divs for name, coordinates, address, phone, website, mass times
|
|
* 3. Convert mass times from H.MMam/pm to HH:MM 24h format
|
|
* 4. Detect country from address patterns (for outside-gb)
|
|
* 5. Match against existing churches, upsert with mass schedules
|
|
*
|
|
* Usage:
|
|
* npx tsx scripts/import-weekdaymasses.ts --all
|
|
* npx tsx scripts/import-weekdaymasses.ts --area gb
|
|
* npx tsx scripts/import-weekdaymasses.ts --area outside-gb --dry-run
|
|
* npx tsx scripts/import-weekdaymasses.ts --all --resume-from 500
|
|
* npx tsx scripts/import-weekdaymasses.ts --all --job-id {uuid}
|
|
*/
|
|
|
|
import dotenv from 'dotenv';
|
|
import path from 'path';
|
|
|
|
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
|
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
|
|
|
import { Pool } from 'pg';
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
|
import { PrismaClient } from '@prisma/client';
|
|
|
|
// Connection string: DATABASE_URL when set and non-empty, otherwise the local development database.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';

// Log the target with the password segment (between ":" and "@") masked.
const maskedDbUrl = dbUrl.replace(/:[^:@]+@/, ':***@');
console.log(`Connecting to database: ${maskedDbUrl}`);

// URLs containing "neon" connect over TLS without certificate verification; all others leave `ssl` unset.
const sslOptions = dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined;

const pool = new Pool({ connectionString: dbUrl, ssl: sslOptions });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
|
|
|
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
|
import type { ExistingChurch } from '../src/lib/church-matcher';
|
|
|
|
// ─── Constants ───────────────────────────────────────────────────────────────
|
|
|
|
/** Base URL of the source site. */
const SITE_BASE = 'https://weekdaymasses.org.uk';

/** Value of the `User-Agent` header sent by `fetchPage`. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
|
|
|
|
/**
 * Importable area pages, keyed by the area name used on the command line
 * (`gb`, `ireland`, `outside-gb`).
 *
 * `defaultCountry` is the ISO code applied to every church of the area; an
 * empty string means the country has to be detected from each church address.
 */
const AREA_PAGES: Record<string, { url: string; defaultCountry: string }> = {
  gb: {
    url: '/en/area/gb/churches',
    defaultCountry: 'GB',
  },
  ireland: {
    url: '/en/area/ireland/churches',
    defaultCountry: 'IE',
  },
  'outside-gb': {
    url: '/en/area/outside-gb/churches',
    defaultCountry: '', // detected per church
  },
};
|
|
|
|
/**
 * Lower-cased language names recognised in the parenthesised annotation that
 * may follow a mass time, e.g. "6.30am(Tamil)". An annotation that is not in
 * this set is kept as a free-text note instead of a language.
 */
const KNOWN_LANGUAGES = new Set([
  'english', 'tamil', 'sinhala', 'sinhalese',
  'french', 'spanish', 'portuguese', 'polish', 'italian', 'german', 'latin',
  'korean', 'japanese', 'chinese', 'mandarin', 'cantonese',
  'tagalog', 'filipino',
  'hindi', 'malayalam', 'konkani', 'telugu', 'kannada', 'marathi', 'bengali', 'urdu',
  'arabic', 'vietnamese', 'indonesian', 'malay',
  'dutch', 'hungarian', 'czech', 'slovak', 'slovenian', 'croatian',
  'swahili', 'igbo', 'yoruba', 'ga', 'twi', 'ewe', 'shona', 'zulu', 'sesotho', 'afrikaans',
]);
|
|
|
|
/**
 * Lower-cased country (or well-known territory/emirate) names mapped to ISO
 * 3166-1 alpha-2 codes. Names are searched for anywhere in an address rather
 * than only at its end, because addresses carry trailing line breaks.
 *
 * Entry order is significant: `detectCountry` sorts names by length with a
 * stable sort, so insertion order decides between names of equal length.
 */
const COUNTRY_NAME_MAP: Record<string, string> = {
  'india': 'IN', 'sri lanka': 'LK', 'france': 'FR', 'italy': 'IT',
  'spain': 'ES', 'portugal': 'PT', 'germany': 'DE', 'south korea': 'KR',
  'korea': 'KR', 'japan': 'JP', 'philippines': 'PH', 'singapore': 'SG',
  'malaysia': 'MY', 'hong kong': 'HK', 'thailand': 'TH', 'indonesia': 'ID',
  'vietnam': 'VN', 'pakistan': 'PK', 'bangladesh': 'BD', 'nepal': 'NP',
  'myanmar': 'MM', 'nigeria': 'NG', 'ghana': 'GH', 'kenya': 'KE',
  'tanzania': 'TZ', 'uganda': 'UG', 'south africa': 'ZA', 'australia': 'AU',
  'new zealand': 'NZ', 'canada': 'CA', 'belgium': 'BE', 'netherlands': 'NL',
  'luxembourg': 'LU', 'switzerland': 'CH', 'austria': 'AT', 'poland': 'PL',
  'hungary': 'HU', 'czech republic': 'CZ', 'czechia': 'CZ', 'mexico': 'MX',
  'brazil': 'BR', 'argentina': 'AR', 'colombia': 'CO', 'peru': 'PE',
  'chile': 'CL', 'china': 'CN', 'taiwan': 'TW', 'ireland': 'IE',
  'malta': 'MT', 'cyprus': 'CY', 'croatia': 'HR', 'slovenia': 'SI',
  'romania': 'RO', 'slovakia': 'SK', 'senegal': 'SN', 'grenada': 'GD',
  'greece': 'GR', 'denmark': 'DK', 'sweden': 'SE', 'norway': 'NO',
  'finland': 'FI', 'iceland': 'IS', 'latvia': 'LV', 'lithuania': 'LT',
  'estonia': 'EE', 'ukraine': 'UA', 'russia': 'RU', 'georgia': 'GE',
  'armenia': 'AM', 'jordan': 'JO', 'lebanon': 'LB', 'israel': 'IL',
  'turkey': 'TR', 'egypt': 'EG', 'morocco': 'MA', 'tunisia': 'TN',
  'cameroon': 'CM', 'ethiopia': 'ET', 'madagascar': 'MG', 'mozambique': 'MZ',
  'zambia': 'ZM', 'zimbabwe': 'ZW', 'trinidad': 'TT', 'trinidad and tobago': 'TT',
  'jamaica': 'JM', 'barbados': 'BB', 'bahamas': 'BS', 'bermuda': 'BM',
  'costa rica': 'CR', 'panama': 'PA', 'guatemala': 'GT', 'honduras': 'HN',
  'el salvador': 'SV', 'nicaragua': 'NI', 'ecuador': 'EC', 'venezuela': 'VE',
  'bolivia': 'BO', 'paraguay': 'PY', 'uruguay': 'UY', 'puerto rico': 'PR',
  'fiji': 'FJ', 'samoa': 'WS', 'tonga': 'TO', 'guam': 'GU',
  'liechtenstein': 'LI', 'monaco': 'MC', 'andorra': 'AD', 'san marino': 'SM',
  'serbia': 'RS', 'bosnia': 'BA', 'montenegro': 'ME', 'north macedonia': 'MK',
  'albania': 'AL', 'kosovo': 'XK', 'bulgaria': 'BG', 'moldova': 'MD',
  'belarus': 'BY', 'kazakhstan': 'KZ', 'uzbekistan': 'UZ', 'kyrgyzstan': 'KG',
  'tajikistan': 'TJ', 'cambodia': 'KH', 'laos': 'LA', 'brunei': 'BN',
  'east timor': 'TL', 'timor-leste': 'TL', 'papua new guinea': 'PG', 'mongolia': 'MN',
  'curaçao': 'CW', 'curacao': 'CW', 'cape verde': 'CV', 'cabo verde': 'CV',
  'the gambia': 'GM', 'gambia': 'GM', 'congo': 'CD', 'ivory coast': 'CI',
  "côte d'ivoire": 'CI', 'burkina faso': 'BF', 'suriname': 'SR', 'guyana': 'GY',
  'belize': 'BZ', 'haiti': 'HT', 'dominican republic': 'DO', 'cuba': 'CU',
  'qatar': 'QA', 'united arab emirates': 'AE', 'u.a.e.': 'AE', 'uae': 'AE',
  'dubai': 'AE', 'abu dhabi': 'AE', 'saudi arabia': 'SA', 'bahrain': 'BH',
  'kuwait': 'KW', 'oman': 'OM', 'antigua and barbuda': 'AG', 'antigua': 'AG',
  'mauritius': 'MU', 'réunion': 'RE', 'reunion': 'RE', 'seychelles': 'SC',
  'saint lucia': 'LC', 'st. lucia': 'LC', 'dominica': 'DM', 'saint vincent': 'VC',
  'st. vincent': 'VC', 'saint kitts': 'KN', 'st. kitts': 'KN', 'u.s. virgin islands': 'VI',
  'us virgin islands': 'VI', 'saint croix': 'VI', 'saint thomas': 'VI', 'virgin islands': 'VI',
  'aruba': 'AW', 'bonaire': 'BQ', 'sint maarten': 'SX', 'iraq': 'IQ',
  'iran': 'IR', 'afghanistan': 'AF', 'macao': 'MO', 'macau': 'MO',
};
|
|
|
|
/**
 * City / region names (lower-cased, including native-script spellings) mapped
 * to ISO 3166-1 alpha-2 codes. Used as a fallback when an address does not
 * mention a country name.
 *
 * Entries are checked in insertion order and the first hit wins, so the order
 * below is significant. Every key must appear exactly once: TypeScript rejects
 * duplicate property names in an object literal (TS1117).
 */
const CITY_COUNTRY_MAP: Record<string, string> = {
  // Major cities that unambiguously identify a country
  'jakarta': 'ID', 'surabaya': 'ID', 'bandung': 'ID', 'yogyakarta': 'ID',
  'budapest': 'HU', 'berlin': 'DE', 'münchen': 'DE', 'munich': 'DE', 'hamburg': 'DE',
  'köln': 'DE', 'frankfurt': 'DE', 'düsseldorf': 'DE', 'stuttgart': 'DE',
  'paris': 'FR', 'lyon': 'FR', 'marseille': 'FR', 'toulouse': 'FR', 'lille': 'FR',
  'nantes': 'FR', 'bordeaux': 'FR', 'strasbourg': 'FR', 'rennes': 'FR',
  'roma': 'IT', 'rome': 'IT', 'milano': 'IT', 'milan': 'IT', 'napoli': 'IT',
  'torino': 'IT', 'firenze': 'IT', 'florence': 'IT', 'bologna': 'IT', 'genova': 'IT',
  'madrid': 'ES', 'barcelona': 'ES', 'valencia': 'ES', 'sevilla': 'ES', 'seville': 'ES',
  'málaga': 'ES', 'bilbao': 'ES', 'mallorca': 'ES', 'tenerife': 'ES',
  'lisboa': 'PT', 'lisbon': 'PT', 'porto': 'PT', 'faro': 'PT',
  'warszawa': 'PL', 'warsaw': 'PL', 'kraków': 'PL', 'krakow': 'PL',
  'praha': 'CZ', 'prague': 'CZ', 'brno': 'CZ',
  'wien': 'AT', 'vienna': 'AT', 'innsbruck': 'AT', 'salzburg': 'AT', 'graz': 'AT',
  'zürich': 'CH', 'zurich': 'CH', 'genève': 'CH', 'geneva': 'CH', 'bern': 'CH', 'basel': 'CH',
  'amsterdam': 'NL', 'rotterdam': 'NL', 'den haag': 'NL',
  'bruxelles': 'BE', 'brussels': 'BE', 'brugge': 'BE', 'antwerpen': 'BE',
  'københavn': 'DK', 'copenhagen': 'DK', 'aarhus': 'DK', 'aalborg': 'DK',
  'stockholm': 'SE', 'göteborg': 'SE', 'malmö': 'SE',
  'oslo': 'NO', 'bergen': 'NO',
  'helsinki': 'FI',
  'reykjavik': 'IS',
  'riga': 'LV', 'vilnius': 'LT', 'tallinn': 'EE',
  'kyiv': 'UA', 'київ': 'UA', 'lviv': 'UA',
  'москва': 'RU', 'moscow': 'RU', 'санкт-петербург': 'RU', 'магадан': 'RU',
  'калуга': 'RU', 'новосибирск': 'RU', 'владивосток': 'RU',
  'tbilisi': 'GE', 'yerevan': 'AM',
  'amman': 'JO', 'beirut': 'LB', 'istanbul': 'TR', 'ankara': 'TR',
  'cairo': 'EG', 'casablanca': 'MA', 'tunis': 'TN',
  'nairobi': 'KE', 'dar es salaam': 'TZ', 'kampala': 'UG', 'lagos': 'NG', 'accra': 'GH',
  'johannesburg': 'ZA', 'cape town': 'ZA', 'durban': 'ZA', 'pretoria': 'ZA',
  'seoul': 'KR', 'busan': 'KR', 'tokyo': 'JP', 'osaka': 'JP', 'yokohama': 'JP',
  'nagasaki': 'JP', 'kyoto': 'JP', 'beijing': 'CN', 'shanghai': 'CN',
  'taipei': 'TW', 'mumbai': 'IN', 'chennai': 'IN', 'kolkata': 'IN', 'delhi': 'IN',
  'new delhi': 'IN', 'bangalore': 'IN', 'bengaluru': 'IN', 'hyderabad': 'IN', 'goa': 'IN',
  'colombo': 'LK', 'matara': 'LK', 'kandy': 'LK', 'galle': 'LK',
  'kuala lumpur': 'MY', 'penang': 'MY',
  'manila': 'PH', 'cebu': 'PH',
  'bangkok': 'TH', 'chiang mai': 'TH',
  'hà nội': 'VN', 'hanoi': 'VN', 'ho chi minh': 'VN', 'saigon': 'VN',
  'phnom penh': 'KH', 'vientiane': 'LA',
  'sydney': 'AU', 'melbourne': 'AU', 'brisbane': 'AU', 'perth': 'AU', 'adelaide': 'AU',
  'auckland': 'NZ', 'wellington': 'NZ', 'christchurch': 'NZ',
  'toronto': 'CA', 'vancouver': 'CA', 'montreal': 'CA', 'ottawa': 'CA',
  'mexico city': 'MX', 'guadalajara': 'MX', 'monterrey': 'MX',
  'são paulo': 'BR', 'rio de janeiro': 'BR', 'brasília': 'BR',
  'buenos aires': 'AR', 'bogotá': 'CO', 'lima': 'PE', 'santiago': 'CL',
  'vaduz': 'LI', 'monaco': 'MC',
  'valletta': 'MT', 'nicosia': 'CY', 'zagreb': 'HR', 'ljubljana': 'SI',
  'bratislava': 'SK', 'bucharest': 'RO', 'sofia': 'BG', 'belgrade': 'RS',
  'nadi': 'FJ', 'suva': 'FJ',
  'san juan': 'PR', 'viejo san juan': 'PR',
  // Cities missed in the first pass
  'calais': 'FR', 'lourdes': 'FR', 'nice': 'FR', 'montpellier': 'FR', 'toulon': 'FR',
  'abidjan': 'CI', 'douala': 'CM', 'yaoundé': 'CM', 'kinshasa': 'CD', 'lusaka': 'ZM',
  'harare': 'ZW', 'maputo': 'MZ', 'antananarivo': 'MG', 'dakar': 'SN',
  'pademangan': 'ID', 'jakarta utara': 'ID', 'denpasar': 'ID', 'semarang': 'ID',
  'makassar': 'ID', 'medan': 'ID', 'bogor': 'ID', 'malang': 'ID', 'palembang': 'ID',
  '서울': 'KR', '부산': 'KR', // Seoul, Busan in Korean
  // Japanese city names in kanji
  '東京': 'JP', '大阪': 'JP', '横浜': 'JP', '名古屋': 'JP', '長崎': 'JP',
  '京都': 'JP', '神戸': 'JP', '福岡': 'JP', '札幌': 'JP', '仙台': 'JP', '広島': 'JP',
  // Chinese city names in hanzi
  '北京': 'CN', '上海': 'CN', '深圳': 'CN', '广州': 'CN', '香港': 'HK',
  // More missing cities
  'kuching': 'MY', 'kota kinabalu': 'MY', 'ipoh': 'MY', 'johor bahru': 'MY', 'sarawak': 'MY',
  'trondheim': 'NO', 'stavanger': 'NO', 'tromsø': 'NO',
  'taastrup': 'DK', 'odense': 'DK',
  'cancún': 'MX', 'playa del carmen': 'MX', 'mérida': 'MX', 'puebla': 'MX', 'cancun': 'MX',
  'addis ababa': 'ET',
  'la paz': 'BO', 'cochabamba': 'BO', 'santa cruz': 'BO',
  'willemstad': 'CW', 'curaçao': 'CW', 'curacao': 'CW',
  'port of spain': 'TT', 'bridgetown': 'BB', 'nassau': 'BS',
  'siem reap': 'KH', // 'phnom penh' is already listed above
  'port moresby': 'PG',
  'ulaanbaatar': 'MN',
  'praia': 'CV', 'cape verde': 'CV',
  'celebration': 'US', // Celebration, Florida — city not great, but helps
  'the gambia': 'GM', 'gambia': 'GM', 'banjul': 'GM',
  'playa blanca': 'ES', 'gran canaria': 'ES', 'fuerteventura': 'ES', 'lanzarote': 'ES',
  'tirana': 'AL', 'durrës': 'AL',
  'podgorica': 'ME', 'budva': 'ME',
  'skopje': 'MK', 'pristina': 'XK', 'sarajevo': 'BA',
  'minsk': 'BY', 'chișinău': 'MD', 'chisinau': 'MD',
  'bishkek': 'KG', 'dushanbe': 'TJ', 'tashkent': 'UZ', 'almaty': 'KZ', 'astana': 'KZ',
  'lekki': 'NG', 'abuja': 'NG', 'enugu': 'NG', 'yaba': 'NG', 'ikeja': 'NG',
  // Serbian
  'beograd': 'RS', 'novi sad': 'RS',
  // Thai
  'phuket': 'TH', 'pattaya': 'TH', 'hua hin': 'TH',
  // Spanish cities
  'alicante': 'ES', 'zaragoza': 'ES', 'murcia': 'ES', 'palma': 'ES',
  'granada': 'ES', 'córdoba': 'ES', 'santander': 'ES', 'cádiz': 'ES',
  'san sebastián': 'ES', 'las palmas': 'ES', 'santa cruz de tenerife': 'ES',
  // Belgian
  'woluwe': 'BE', 'ixelles': 'BE', 'schaerbeek': 'BE', 'liège': 'BE', 'namur': 'BE',
  // Portuguese
  'loulé': 'PT', 'albufeira': 'PT', 'coimbra': 'PT', 'braga': 'PT', 'funchal': 'PT',
  // Turkish
  'mersin': 'TR', 'izmir': 'TR', 'antalya': 'TR', 'trabzon': 'TR',
  // Lebanese (French spelling)
  'beyrouth': 'LB',
  // Burkina Faso
  'ouagadougou': 'BF', 'bobo-dioulasso': 'BF',
  // Greek
  'heraklion': 'GR', 'ηράκλειο': 'GR', 'μυτιλήνη': 'GR', 'αθήνα': 'GR',
  'athens': 'GR', 'thessaloniki': 'GR', 'patras': 'GR',
  // Bulgarian (transliterated)
  'plovdiv': 'BG', 'пловдив': 'BG', 'варна': 'BG',
  // Vietnamese with diacritics
  'sài gòn': 'VN', 'hồ chí minh': 'VN', 'đà nẵng': 'VN',
  // Moldovan
  'chişinău': 'MD',
  // Hungarian
  'ferenciek': 'HU', 'debrecen': 'HU', 'szeged': 'HU', 'pécs': 'HU',
  // Polish cities
  'kalisz': 'PL', 'gdańsk': 'PL', 'wrocław': 'PL', 'poznań': 'PL', 'łódź': 'PL',
  'katowice': 'PL', 'lublin': 'PL', 'szczecin': 'PL',
  // Bermuda
  'warwick': 'BM',
  // Maltese ('valletta' is already listed above)
  'sliema': 'MT',
};
|
|
|
|
/**
 * Postal-code and state-code patterns used as the last resort for country
 * detection. Patterns are tried top to bottom against the original-case
 * address and the first match wins, so more specific formats come first.
 */
const POSTAL_PATTERNS: Array<{ pattern: RegExp; country: string }> = [
  // UK postcode
  { country: 'GB', pattern: /\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b/ },
  // Irish Eircode
  { country: 'IE', pattern: /\b[A-Z]\d{2}\s*[A-Z0-9]{4}\b/ },
  { country: 'VA', pattern: /\bCittà del Vaticano\b/i },
  // Indian 6-digit PIN with optional space
  { country: 'IN', pattern: /\b\d{3}\s*\d{3}\b/ },
  // Australian state codes followed by a 4-digit postcode
  { country: 'AU', pattern: /\bNSW\s+\d{4}\b/ },
  { country: 'AU', pattern: /\bVIC\s+\d{4}\b/ },
  { country: 'AU', pattern: /\bQLD\s+\d{4}\b/ },
  { country: 'AU', pattern: /\bSA\s+\d{4}\b/ },
  { country: 'AU', pattern: /\bWA\s+\d{4}\b/ },
  { country: 'AU', pattern: /\bTAS\s+\d{4}\b/ },
  { country: 'AU', pattern: /\bACT\s+\d{4}\b/ },
  { country: 'AU', pattern: /\bNT\s+\d{4}\b/ },
  // Austrian postal prefix
  { country: 'AT', pattern: /\bA-\d{4}\b/ },
  // Puerto Rico
  { country: 'PR', pattern: /\b, PR,?\s*\d{5}\b/ },
  // Polish postal code (XX-XXX)
  { country: 'PL', pattern: /\b\d{2}-\d{3}\b/ },
  // US state abbreviation + ZIP (e.g., "NY, 11201" or "NY 11201")
  {
    country: 'US',
    pattern: /\b(AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY)[,\s]+\d{5}\b/,
  },
];
|
|
|
|
// ─── Types ───────────────────────────────────────────────────────────────────
|
|
|
|
/** One church as extracted from a `.church` block of an area page. */
interface ParsedChurch {
  /** weekdaymasses numeric ID (the digits of the block's `id="pNNNNN"`). */
  churchId: string;
  name: string;
  /** Latitude from the map link; 0 when the block has no map link. */
  latitude: number;
  /** Longitude from the map link; 0 when the block has no map link. */
  longitude: number;
  address: string | null;
  phone: string | null;
  website: string | null;
  /** Country code; `'XX'` when it could not be determined. */
  country: string;
  schedules: ParsedSchedule[];
}
|
|
|
|
/** A single weekly mass time for one day of the week. */
interface ParsedSchedule {
  /** 0=Sun, 1=Mon, ..., 6=Sat */
  dayOfWeek: number;
  /** 24-hour "HH:MM", e.g. "07:00", "18:30" */
  time: string;
  language: string;
  notes: string | null;
}
|
|
|
|
/**
 * Running counters for one import run. Job progress reports
 * `churchesMatched + churchesCreated` as succeeded, `errors` as failed and
 * `churchesParsed` as items found.
 */
interface ImportStats {
  churchesParsed: number;
  churchesMatched: number;
  churchesCreated: number;
  churchesSkipped: number;
  massSchedulesCreated: number;
  errors: number;
}
|
|
|
|
/** Parsed command-line options of the importer. */
interface CLIArgs {
  /** `--all`: import all area pages. */
  all: boolean;
  /** `--area <name>`: import a single area (gb, ireland, outside-gb). */
  area?: string;
  /** `--dry-run`: no database writes. */
  dryRun: boolean;
  /** `--resume-from <n>`: skip the first N churches. */
  resumeFrom: number;
  /** `--job-id <uuid>`: background job to report progress to. */
  jobId?: string;
}
|
|
|
|
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Parse the process command line into {@link CLIArgs}.
 *
 * Recognised flags: `--all`, `--area <name>`, `--dry-run`,
 * `--resume-from <n>`, `--job-id <uuid>` and `--help`. Unrecognised
 * arguments are ignored.
 *
 * Exits the process with status 1 (after printing an error) when
 * - neither `--all` nor `--area` is given,
 * - a flag that takes a value is last on the line or followed by another flag,
 * - `--resume-from` is not a non-negative integer (previously this silently
 *   produced `NaN`).
 * `--help` prints usage and exits with status 0.
 *
 * @returns the parsed options
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = { all: false, dryRun: false, resumeFrom: 0 };

  /** Report a usage error and terminate. */
  function failUsage(message: string): never {
    console.error(`Error: ${message}`);
    process.exit(1);
  }

  /** Return the value at `index`, rejecting a missing value or a following flag. */
  function takeValue(flag: string, index: number): string {
    const value = args[index];
    if (value === undefined || value.startsWith('--')) {
      return failUsage(`${flag} requires a value`);
    }
    return value;
  }

  for (let i = 0; i < args.length; i++) {
    const flag = args[i];
    switch (flag) {
      case '--all':
        result.all = true;
        break;
      case '--area':
        result.area = takeValue(flag, ++i);
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = takeValue(flag, ++i);
        // parseInt alone would accept "12abc" and turn "abc" into NaN.
        if (!/^\d+$/.test(raw)) {
          failUsage(`--resume-from expects a non-negative integer, got "${raw}"`);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--job-id':
        result.jobId = takeValue(flag, ++i);
        break;
      case '--help':
        console.log(
          [
            'Usage: npx tsx scripts/import-weekdaymasses.ts [options]',
            '--all Import all 3 area pages (gb, ireland, outside-gb)',
            '--area <name> Import specific area (gb, ireland, outside-gb)',
            '--dry-run No database writes',
            '--resume-from <n> Skip first N churches',
            '--job-id <uuid> Background job tracking',
          ].join('\n'),
        );
        process.exit(0);
    }
  }

  if (!result.all && !result.area) {
    console.error('Error: specify --all or --area <name>');
    process.exit(1);
  }

  return result;
}
|
|
|
|
// ─── HTTP ────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Download a page as text.
 *
 * @param url absolute URL to request
 * @returns the response body, or `null` when the request fails or the server
 *          answers with a non-2xx status (the failure is logged to stderr)
 */
async function fetchPage(url: string): Promise<string | null> {
  const headers = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  try {
    const response = await fetch(url, { headers });
    if (response.ok) {
      return await response.text();
    }
    console.error(` HTTP ${response.status} for ${url}`);
    return null;
  } catch (error) {
    // Network failures and body-read failures end up here alike.
    const reason = error instanceof Error ? error.message : error;
    console.error(` Fetch error for ${url}: ${reason}`);
    return null;
  }
}
|
|
|
|
// ─── HTML Parsing ────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Cut an area page into one HTML fragment per church.
 *
 * A church starts at `<div class="church" id="pNNNNN">`; its end is found by
 * counting nested `<div` / `</div>` pairs until the opening tag is balanced.
 * When the page ends before the div is balanced, the fragment collected so
 * far is still returned.
 *
 * @param html full HTML of an area page
 * @returns the church fragments in document order
 */
function extractChurchBlocks(html: string): string[] {
  const OPEN_TAG = '<div';
  const CLOSE_TAG = '</div>';
  const fragments: string[] = [];

  for (const opening of html.matchAll(/<div\s+class="church"\s+id="p(\d+)">/g)) {
    const start = opening.index ?? 0;
    let cursor = start + opening[0].length;
    let openDivs = 1;

    while (openDivs > 0 && cursor < html.length) {
      const closeAt = html.indexOf(CLOSE_TAG, cursor);
      if (closeAt === -1) break; // unbalanced markup: give up on this block

      const openAt = html.indexOf(OPEN_TAG, cursor);
      const nestedDivComesFirst = openAt !== -1 && openAt < closeAt;
      if (nestedDivComesFirst) {
        openDivs += 1;
        cursor = openAt + OPEN_TAG.length;
      } else {
        openDivs -= 1;
        cursor = closeAt + CLOSE_TAG.length;
      }
    }

    fragments.push(html.slice(start, cursor));
  }

  return fragments;
}
|
|
|
|
/**
 * Parse a single church block HTML into structured data.
 *
 * Text fields (name, address, phone, website) are HTML-entity decoded, and
 * address and phone additionally have markup stripped, so that values such
 * as `St Mary&#39;s` or `?a=1&amp;b=2` are stored as readable text.
 *
 * @param html one fragment produced by `extractChurchBlocks`
 * @param defaultCountry country code for the whole area page; when empty the
 *        country is detected from the address
 * @returns the parsed church, or `null` when the block has no `id="pNNN"`
 *          attribute or no `<h3>` name
 */
function parseChurchBlock(html: string, defaultCountry: string): ParsedChurch | null {
  // Church ID from div id="pNNNNN"
  const idMatch = html.match(/id="p(\d+)"/);
  if (!idMatch) return null;
  const churchId = idMatch[1];

  // Name from h3
  const nameMatch = html.match(/<h3>(.*?)<\/h3>/s);
  if (!nameMatch) return null;
  const name = decodeHtmlEntities(nameMatch[1].trim());

  // Coordinates from map link; stay at 0/0 when absent or not parseable
  // (e.g. a lone "." would otherwise yield NaN).
  let latitude = 0;
  let longitude = 0;
  const mapMatch = html.match(/lat=(-?[\d.]+)&(?:amp;)?lon=(-?[\d.]+)/);
  if (mapMatch) {
    const lat = parseFloat(mapMatch[1]);
    const lon = parseFloat(mapMatch[2]);
    if (Number.isFinite(lat) && Number.isFinite(lon)) {
      latitude = lat;
      longitude = lon;
    }
  }

  // Address from p.address — text after the last <br> tag (or after the Streetview link)
  let address: string | null = null;
  const addressMatch = html.match(/<p\s+class="address">([\s\S]*?)<\/p>/);
  if (addressMatch) {
    const addressHtml = addressMatch[1];
    const brIdx = addressHtml.lastIndexOf('<br');
    if (brIdx !== -1) {
      const textAfterTag = addressHtml.substring(brIdx).replace(/<br\s*\/?>/, '').trim();
      // Strip tags before decoding so a decoded "<" is never mistaken for markup.
      address = decodeHtmlEntities(stripHtmlTags(textAfterTag)).trim() || null;
    }
  }

  // Phone from p.telephone
  let phone: string | null = null;
  const phoneMatch = html.match(/<p\s+class="telephone">[\s\S]*?Tel:<\/span>\s*(.*?)<\/p>/);
  if (phoneMatch) {
    phone = decodeHtmlEntities(stripHtmlTags(phoneMatch[1])).trim() || null;
  }

  // Website from p.transport with "Link to church website:"
  let website: string | null = null;
  const websiteMatch = html.match(/Link to church website:<\/span>\s*<a[^>]+href="([^"]+)"/);
  if (websiteMatch) {
    // href attribute values carry "&amp;" for "&".
    website = decodeHtmlEntities(websiteMatch[1]).trim() || null;
  }

  // Country detection
  let country = defaultCountry;
  if (!country && address) {
    country = detectCountry(address);
  }
  if (!country) country = 'XX'; // Unknown

  // Mass schedules from p.times
  const schedules = parseScheduleBlocks(html);

  return { churchId, name, latitude, longitude, address, phone, website, country, schedules };
}
|
|
|
|
/** Country names from COUNTRY_NAME_MAP, longest first (stable), computed once. */
const COUNTRY_NAMES_LONGEST_FIRST: ReadonlyArray<readonly [string, string]> =
  Object.entries(COUNTRY_NAME_MAP).sort((a, b) => b[0].length - a[0].length);

/**
 * US state names, lower-cased. Georgia is last: it is also a country, so it
 * only counts as a state when a ZIP code follows.
 */
const US_STATE_NAMES: readonly string[] = [
  'alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado',
  'connecticut', 'delaware', 'florida', 'hawaii', 'idaho',
  'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana',
  'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota',
  'mississippi', 'missouri', 'montana', 'nebraska', 'nevada',
  'new hampshire', 'new jersey', 'new mexico', 'new york', 'north carolina',
  'north dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania',
  'rhode island', 'south carolina', 'south dakota', 'tennessee', 'texas',
  'utah', 'vermont', 'virginia', 'washington', 'west virginia',
  'wisconsin', 'wyoming', 'georgia',
];

/** US states whose name contains a COUNTRY_NAME_MAP entry as a whole word. */
const US_STATES_SHADOWING_COUNTRIES: readonly string[] = ['new mexico', 'georgia'];

/** Letters, combining marks and digits: characters that continue a word. */
const COUNTRY_WORD_CHAR = /[\p{L}\p{M}\p{N}]/u;

/** Scripts written without spaces between words; whole-word checks are meaningless there. */
const UNSPACED_SCRIPT_CHAR = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;

const CITY_LEADING_DELIMITER = /[\s,.(]/;
const CITY_TRAILING_DELIMITER = /[\s,.):\r\n]/;

/** End offsets of every occurrence of `term` in `haystack` that is not part of a longer word. */
function wholeWordEnds(haystack: string, term: string): number[] {
  const ends: number[] = [];
  for (let idx = haystack.indexOf(term); idx !== -1; idx = haystack.indexOf(term, idx + 1)) {
    const end = idx + term.length;
    const before = idx > 0 ? haystack.charAt(idx - 1) : '';
    const after = haystack.charAt(end); // '' past the end
    if (!COUNTRY_WORD_CHAR.test(before) && !COUNTRY_WORD_CHAR.test(after)) {
      ends.push(end);
    }
  }
  return ends;
}

/** True when `state` occurs followed by a 5-digit ZIP, or (except Georgia) at the very end. */
function looksLikeUsState(lower: string, state: string): boolean {
  return wholeWordEnds(lower, state).some((end) => {
    const rest = lower.slice(end);
    if (/^[,\s]+\d{5}/.test(rest)) return true;
    return state !== 'georgia' && /^[,\s]*$/.test(rest);
  });
}

/** True when `city` occurs in `lower` delimited by whitespace/punctuation (any occurrence). */
function addressMentionsCity(lower: string, city: string): boolean {
  // CJK/Hangul addresses run words together ("東京都千代田区"), so a plain
  // substring test is the only check that can ever match those keys.
  if (UNSPACED_SCRIPT_CHAR.test(city)) return lower.includes(city);

  for (let idx = lower.indexOf(city); idx !== -1; idx = lower.indexOf(city, idx + 1)) {
    const end = idx + city.length;
    const startsCleanly = idx === 0 || CITY_LEADING_DELIMITER.test(lower.charAt(idx - 1));
    const endsCleanly = end === lower.length || CITY_TRAILING_DELIMITER.test(lower.charAt(end));
    if (startsCleanly && endsCleanly) return true;
  }
  return false;
}

/**
 * Detect the country of an address. Strategies are tried in this order and
 * the first hit wins:
 *
 * 1. US states that would otherwise be mistaken for a country
 *    ("Georgia 30301", "New Mexico") — checked before country names, because
 *    the country pass would otherwise return GE / MX for them.
 * 2. Country name as a whole word anywhere in the address, longest name
 *    first. Whole-word matching keeps "Roman" from matching "oman",
 *    "Tirana" from matching "iran" and "Indiana" from matching "india".
 * 3. City/region name from CITY_COUNTRY_MAP.
 * 4. US state name followed by a ZIP code, or ending the address.
 * 5. Postal code / state code patterns from POSTAL_PATTERNS.
 *
 * @param address free-text address, possibly containing line breaks
 * @returns an ISO 3166-1 alpha-2 code, or `''` when nothing matched
 */
function detectCountry(address: string): string {
  // Clean address for matching
  const cleaned = address.replace(/\r?\n/g, ' ').trim();
  const lower = cleaned.toLowerCase();

  // 1. US states that shadow a country name
  for (const state of US_STATES_SHADOWING_COUNTRIES) {
    if (looksLikeUsState(lower, state)) return 'US';
  }

  // 2. Country name match (longest first so "dominican republic" beats "dominica")
  for (const [name, code] of COUNTRY_NAMES_LONGEST_FIRST) {
    if (wholeWordEnds(lower, name).length > 0) return code;
  }

  // 3. City/region match
  for (const [city, code] of Object.entries(CITY_COUNTRY_MAP)) {
    if (addressMentionsCity(lower, city)) return code;
  }

  // 4. US state detection
  for (const state of US_STATE_NAMES) {
    if (looksLikeUsState(lower, state)) return 'US';
  }

  // 5. Postal code patterns (case-sensitive, so tested on the original-case text)
  for (const { pattern, country } of POSTAL_PATTERNS) {
    if (pattern.test(cleaned)) return country;
  }

  return '';
}
|
|
|
|
/**
 * Collect the mass schedule of a church from every `<p class="times">`
 * paragraph in its HTML. Each paragraph is reduced to a single line of plain
 * text (tags removed, whitespace collapsed) and parsed as one times line.
 *
 * @param html HTML of one church block
 * @returns all schedule entries, in paragraph order
 */
function parseScheduleBlocks(html: string): ParsedSchedule[] {
  const paragraphs = Array.from(html.matchAll(/<p\s+class="times">([\s\S]*?)<\/p>/g));

  return paragraphs.flatMap((paragraph) => {
    const line = stripHtmlTags(paragraph[1]).replace(/\s+/g, ' ').trim();
    return parseTimesLine(line);
  });
}
|
|
|
|
/**
 * Turn one mass times line into schedule entries. Supported shapes include:
 *   "Sunday: 6.30am(Tamil), 8.30am(Tamil), 5.30pm(English)"
 *   "Weekday: Monday, Tuesday, Wednesday 6.15am"
 *   "Mon Tue Wed Thu Fri: 6.30am(Tamil)"
 *
 * The text left of the first colon names the default days. The text right of
 * it is split on ";"; a segment that begins with its own day names uses those
 * instead of the defaults. Every recognised time is emitted once per day.
 *
 * @param text plain-text line (no HTML, whitespace collapsed)
 * @returns the schedule entries; empty when the line has no colon or no
 *          usable days/times
 */
function parseTimesLine(text: string): ParsedSchedule[] {
  const separatorAt = text.indexOf(':');
  if (separatorAt === -1) return [];

  const fallbackDays = parseDayLabel(text.substring(0, separatorAt).trim());

  // ";" separates independent groups such as "Monday 10.00am; Thursday 7.30pm".
  const segments = text
    .substring(separatorAt + 1)
    .trim()
    .split(';')
    .map((segment) => segment.trim())
    .filter(Boolean);

  const result: ParsedSchedule[] = [];

  for (const segment of segments) {
    const { specificDays, cleanedTimePart } = extractSpecificDays(segment);
    const namesOwnDays = specificDays.length > 0;
    const days = namesOwnDays ? specificDays : fallbackDays;
    if (days.length === 0) continue;

    // When the segment named its own days, parse only what follows them.
    const timesText = namesOwnDays ? cleanedTimePart : segment;

    for (const { time, language, notes } of extractTimeEntries(timesText)) {
      const time24 = convertTo24h(time);
      if (!time24) continue;

      for (const dayOfWeek of days) {
        result.push({ dayOfWeek, time: time24, language, notes });
      }
    }
  }

  return result;
}
|
|
|
|
/** Lower-cased English day names and three-letter abbreviations → day number (0=Sun … 6=Sat). */
const DAY_MAP: Record<string, number> = {
  sunday: 0,
  sun: 0,
  monday: 1,
  mon: 1,
  tuesday: 2,
  tue: 2,
  wednesday: 3,
  wed: 3,
  thursday: 4,
  thu: 4,
  friday: 5,
  fri: 5,
  saturday: 6,
  sat: 6,
};
|
|
|
|
/**
 * Translate the day label of a times line (the text left of the colon) into
 * day numbers (0=Sun … 6=Sat).
 *
 * - "Weekday"/"Weekdays" → Monday to Friday.
 * - Labels mentioning "holy day"/"holyday" → no days (not a regular schedule).
 * - Otherwise every whitespace/comma separated word that is a known day name
 *   or abbreviation contributes its number; other words are ignored.
 *
 * @param label e.g. "Sunday", "Mon Tue Wed Thu Fri", "Weekday"
 * @returns day numbers in label order; may be empty
 */
function parseDayLabel(label: string): number[] {
  const normalized = label.toLowerCase().trim();

  if (['weekday', 'weekdays'].includes(normalized)) {
    return [1, 2, 3, 4, 5];
  }

  const isHolyDay = ['holy day', 'holyday'].some((marker) => normalized.includes(marker));
  if (isHolyDay) {
    return [];
  }

  return normalized.split(/[\s,]+/).flatMap((word) => {
    const day = DAY_MAP[word];
    return day === undefined ? [] : [day];
  });
}
|
|
|
|
/** Three-letter day prefixes; the array index is the day number (0=Sun … 6=Sat). */
const DAY_PREFIXES: readonly string[] = ['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat'];

/**
 * One or more day names at the start of the text, separated by commas/spaces.
 * Accepts full names, plurals and the usual abbreviations (Mon, Tues, Weds,
 * Thur, Thurs, …), optionally dotted. The `(?![a-z])` guard keeps a day name
 * from matching the beginning of another word ("Sung", "Monastery", "Wedding").
 */
const LEADING_DAY_RUN = new RegExp(
  '^(?:(?:' +
    'mon(?:days?)?|tue(?:s(?:days?)?)?|wed(?:s|nesdays?)?|thu(?:r(?:s(?:days?)?)?)?|' +
    'fri(?:days?)?|sat(?:urdays?)?|sun(?:days?)?' +
    ')(?![a-z])\\.?[,\\s]*)+',
  'i',
);

/**
 * Check if the time part starts with specific day names.
 * e.g., "Monday, Tuesday, Wednesday 6.15am" -> days=[1,2,3], cleaned="6.15am"
 *
 * Only a leading run of day names is recognised; day names later in the text
 * and ranges such as "Mon-Fri" are not interpreted.
 *
 * @param timePart one ";"-separated segment of a times line
 * @returns `specificDays` — the day numbers named at the start (empty when
 *          there are none), and `cleanedTimePart` — the text after those day
 *          names, or the input unchanged when no day names were found
 */
function extractSpecificDays(timePart: string): { specificDays: number[]; cleanedTimePart: string } {
  const leading = LEADING_DAY_RUN.exec(timePart);
  if (!leading) {
    return { specificDays: [], cleanedTimePart: timePart };
  }

  // Every alphabetic token in the matched run is a day name by construction;
  // its first three letters identify the day.
  const dayTokens = leading[0].match(/[a-z]+/gi) ?? [];
  const specificDays = dayTokens
    .map((token) => DAY_PREFIXES.indexOf(token.slice(0, 3).toLowerCase()))
    .filter((day) => day !== -1);

  if (specificDays.length === 0) {
    return { specificDays: [], cleanedTimePart: timePart };
  }

  return {
    specificDays,
    cleanedTimePart: timePart.substring(leading[0].length).trim(),
  };
}
|
|
|
|
/** One time found in a times string, before conversion to 24-hour format. */
interface TimeEntry {
  /** Raw time without whitespace: "7.00am", "6.30pm" */
  time: string;
  /** Language of the mass; "English" unless the annotation names a known language. */
  language: string;
  /** Parenthesised annotation that is not a language, if any. */
  notes: string | null;
}
|
|
|
|
/**
 * Find every "H.MMam/pm" time in a string, together with the optional
 * parenthesised annotation that directly follows it.
 * e.g., "7.00am(Tamil), 8.30am(Vigil), 12.00pm" ->
 *   [{ time: "7.00am", language: "Tamil", notes: null },
 *    { time: "8.30am", language: "English", notes: "Vigil" },
 *    { time: "12.00pm", language: "English", notes: null }]
 *
 * An annotation equal to a known language (case-insensitive) becomes the
 * language, capitalised; any other annotation becomes the note.
 *
 * @param text times portion of a schedule line
 * @returns the entries in order of appearance
 */
function extractTimeEntries(text: string): TimeEntry[] {
  const timeWithAnnotation = /(\d{1,2}\.\d{2}\s*(?:am|pm))(?:\s*\(([^)]*)\))?/gi;

  return Array.from(text.matchAll(timeWithAnnotation), (found): TimeEntry => {
    const time = found[1].replace(/\s/g, '');
    const annotation = found[2]?.trim() || null;

    if (annotation !== null && KNOWN_LANGUAGES.has(annotation.toLowerCase())) {
      const language = annotation.charAt(0).toUpperCase() + annotation.slice(1).toLowerCase();
      return { time, language, notes: null };
    }

    return { time, language: 'English', notes: annotation };
  });
}
|
|
|
|
/**
 * Convert a 12-hour "H.MMam/pm" time into 24-hour "HH:MM".
 *
 * @param time e.g. "7.00am", "6.30pm" (no whitespace; am/pm in any case)
 * @returns the zero-padded 24-hour time, or `null` when the input does not
 *          have that shape, the minutes exceed 59, or the resulting hour
 *          falls outside 0–23
 */
function convertTo24h(time: string): string | null {
  const parts = /^(\d{1,2})\.(\d{2})(am|pm)$/i.exec(time);
  if (parts === null) return null;

  const [, hourText, minuteText, meridiem] = parts;

  const minutes = parseInt(minuteText, 10);
  if (minutes < 0 || minutes > 59) return null;

  const hour12 = parseInt(hourText, 10);
  const isAm = meridiem.toLowerCase() === 'am';

  // 12am is midnight (00), 12pm is noon (12); every other pm hour shifts by 12.
  let hour24: number;
  if (isAm) {
    hour24 = hour12 === 12 ? 0 : hour12;
  } else {
    hour24 = hour12 === 12 ? 12 : hour12 + 12;
  }
  if (hour24 < 0 || hour24 > 23) return null;

  const twoDigits = (value: number): string => String(value).padStart(2, '0');
  return `${twoDigits(hour24)}:${twoDigits(minutes)}`;
}
|
|
|
|
// ─── Helpers ────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Remove everything that looks like an HTML tag (`<` … `>`) from a string.
 * Text between tags is kept as is; entities are not decoded.
 */
function stripHtmlTags(html: string): string {
  const textPieces = html.split(/<[^>]+>/);
  return textPieces.join('');
}
|
|
|
|
/** Named character references understood by `decodeHtmlEntities`. */
const NAMED_HTML_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
]);

/**
 * Decode the HTML character references that occur in scraped text:
 * `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, decimal `&#39;` and
 * hexadecimal `&#x27;` references.
 *
 * All references are decoded in a single pass, so text produced by one
 * replacement is never decoded again (`&amp;lt;` becomes `&lt;`, not `<`).
 * Unknown names and numeric references outside the valid Unicode range are
 * left untouched.
 *
 * @param text text possibly containing character references
 * @returns the decoded text
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g,
    (reference: string, decimal?: string, hex?: string, named?: string): string => {
      if (named !== undefined) {
        return NAMED_HTML_ENTITIES.get(named) ?? reference;
      }

      const codePoint = decimal !== undefined ? parseInt(decimal, 10) : parseInt(hex ?? '', 16);
      const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
      const isValid = Number.isInteger(codePoint) && codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;

      // fromCodePoint (unlike fromCharCode) handles characters beyond U+FFFF.
      return isValid ? String.fromCodePoint(codePoint) : reference;
    },
  );
}
|
|
|
|
// ─── Job Management ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Mark an existing background job as running.
 *
 * No job is created here: without a `jobId` nothing is written and the run is
 * simply untracked.
 *
 * @param jobId id of the background job to resume, if any
 * @returns the same `jobId`, or `null` when none was given
 */
async function createOrResumeJob(jobId?: string): Promise<string | null> {
  if (!jobId) {
    return null;
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
|
|
|
/**
 * Record the final state of a background job: `failed` with the error text
 * when `error` is given, `completed` otherwise. Does nothing for a `null`
 * job id. A failure to write the update is logged, never thrown.
 *
 * @param jobId id of the tracked job, or `null` when the run is untracked
 * @param error error message if the run failed
 */
async function completeJob(jobId: string | null, error?: string): Promise<void> {
  if (!jobId) return;

  const finalState = {
    status: error ? 'failed' : 'completed',
    error: error || null,
    completedAt: new Date(),
  };

  try {
    await prisma.backgroundJob.update({ where: { id: jobId }, data: finalState });
  } catch (err) {
    console.error(`Failed to update job ${jobId}:`, err);
  }
}
|
|
|
|
/**
 * Persist a progress snapshot for a tracked job.
 *
 * `succeeded` is the sum of matched and created churches, `failed` mirrors
 * `stats.errors`, and `itemsFound` mirrors `stats.churchesParsed`. Write
 * failures are logged and ignored so progress reporting cannot abort the import.
 *
 * @param jobId - Job to update; nothing happens when it is null or empty.
 * @param stats - Running counters for the current import.
 * @param total - Total number of items expected across all areas.
 * @param processed - Number of items handled so far.
 */
async function updateJobProgress(jobId: string | null, stats: ImportStats, total: number, processed: number): Promise<void> {
  if (!jobId) {
    return;
  }

  const succeeded = stats.churchesMatched + stats.churchesCreated;

  try {
    await prisma.backgroundJob.update({
      where: { id: jobId },
      data: {
        totalItems: total,
        processed,
        succeeded,
        failed: stats.errors,
        itemsFound: stats.churchesParsed,
      },
    });
  } catch (updateFailure) {
    console.error(`Failed to update job progress:`, updateFailure);
  }
}
|
|
|
|
// ─── Database ───────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Load the churches used as the in-memory deduplication set.
 *
 * A church is included when it already carries a weekdayMassesId or when its
 * country is one of the codes listed below. Only the fields selected here
 * (identity, coordinates, per-source ids and contact details) are fetched.
 *
 * @returns The matching church rows.
 */
async function loadExistingChurches(): Promise<ExistingChurch[]> {
  const dedupCountries = ['GB', 'IE', 'IN', 'LK', 'FR', 'IT', 'VA', 'PT', 'ES', 'KR', 'JP', 'PH', 'SG', 'MY', 'HK'];

  return prisma.church.findMany({
    where: {
      OR: [
        { weekdayMassesId: { not: null } },
        { country: { in: dedupCountries } },
      ],
    },
    select: {
      // Identity and position
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // Per-source external ids
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
      country: true,
    },
  });
}
|
|
|
|
// ─── Main Import ────────────────────────────────────────────────────────────
|
|
|
|
/** A successfully parsed church block. */
type ParsedChurchBlock = NonNullable<ReturnType<typeof parseChurchBlock>>;

/** An existing church record returned by the duplicate matcher. */
type MatchedChurch = NonNullable<ReturnType<typeof findDuplicateChurch>>;

/** True when `error` is an Error whose message reports a unique-constraint violation. */
function isUniqueConstraintError(error: unknown): boolean {
  return error instanceof Error && error.message.includes('Unique constraint');
}

/**
 * Attach the weekdaymasses id to an already-known church, fill in contact
 * fields that are still empty, and replace its mass schedules when the parsed
 * block provides any. A unique-constraint violation on the church update is
 * counted as a skip rather than an error.
 */
async function updateMatchedChurch(duplicate: MatchedChurch, church: ParsedChurchBlock, stats: ImportStats): Promise<void> {
  const updateData: Record<string, unknown> = {
    weekdayMassesId: church.churchId,
    lastScrapedAt: new Date(),
  };
  // Only fill in missing fields; never overwrite data that is already present.
  if (!duplicate.phone && church.phone) updateData.phone = church.phone;
  if (!duplicate.website && church.website) {
    updateData.website = church.website;
    updateData.hasWebsite = true;
  }
  if (!duplicate.address && church.address) updateData.address = church.address;
  // Update country if existing is unknown (XX) and we detected a real one.
  if (duplicate.country === 'XX' && church.country !== 'XX') {
    updateData.country = church.country;
  }

  try {
    await prisma.church.update({
      where: { id: duplicate.id },
      data: updateData,
    });
  } catch (error) {
    if (isUniqueConstraintError(error)) {
      stats.churchesSkipped++;
      return;
    }
    throw error;
  }

  // Replace mass schedules atomically, and only when new ones are available,
  // so an empty parse never wipes existing schedule data.
  if (church.schedules.length > 0) {
    await prisma.$transaction(async (tx) => {
      await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
      await tx.massSchedule.createMany({
        data: church.schedules.map((s) => ({
          churchId: duplicate.id,
          dayOfWeek: s.dayOfWeek,
          time: s.time,
          language: s.language,
          notes: s.notes,
        })),
      });
    });
    stats.massSchedulesCreated += church.schedules.length;
  }

  stats.churchesMatched++;
}

/**
 * Insert a church that matched nothing in the dedup set, create its mass
 * schedules, and append it to the in-memory dedup list so later blocks in the
 * same run can match against it. A unique-constraint violation is counted as a
 * skip rather than an error.
 */
async function createUnmatchedChurch(church: ParsedChurchBlock, existingChurches: ExistingChurch[], stats: ImportStats): Promise<void> {
  try {
    const newChurch = await prisma.church.create({
      data: {
        name: church.name,
        latitude: church.latitude,
        longitude: church.longitude,
        address: church.address,
        country: church.country,
        phone: church.phone,
        website: church.website,
        hasWebsite: !!church.website,
        weekdayMassesId: church.churchId,
        source: 'weekdaymasses',
        lastScrapedAt: church.schedules.length > 0 ? new Date() : null,
      },
    });

    if (church.schedules.length > 0) {
      await prisma.massSchedule.createMany({
        data: church.schedules.map((s) => ({
          churchId: newChurch.id,
          dayOfWeek: s.dayOfWeek,
          time: s.time,
          language: s.language,
          notes: s.notes,
        })),
      });
      stats.massSchedulesCreated += church.schedules.length;
    }

    // Add to in-memory dedup list.
    existingChurches.push({
      id: newChurch.id,
      name: church.name,
      latitude: church.latitude,
      longitude: church.longitude,
      osmId: null,
      baiduId: null,
      masstimesId: null,
      orarimesseId: null,
      massSchedulesPhId: null,
      philmassId: null,
      horariosMisasId: null,
      mszeInfoId: null,
      weekdayMassesId: church.churchId,
      messesInfoId: null,
      bohosluzbyId: null,
      miserendId: null,
      kerknetId: null,
      gottesdienstzeitenId: null,
      discovermassId: null,
      source: 'weekdaymasses',
      website: church.website,
      phone: church.phone,
      address: church.address,
      country: church.country,
    });

    stats.churchesCreated++;
  } catch (error) {
    if (isUniqueConstraintError(error)) {
      stats.churchesSkipped++;
      return;
    }
    throw error;
  }
}

/**
 * Import every church block of one area page.
 *
 * Each block is parsed, matched against `existingChurches`, and then either
 * merged into the matching record or inserted as a new church. In dry-run mode
 * nothing is written; the first 20 parsed churches are printed instead.
 *
 * Progress is logged and persisted whenever the absolute position reaches a
 * multiple of 500. The checkpoint is evaluated for every non-dry-run block,
 * including blocks that failed to parse or were skipped as duplicates, so a
 * skip landing exactly on a checkpoint no longer suppresses the report.
 *
 * @param areaName - Label of the area, used in log output.
 * @param config - Area settings; `defaultCountry` is passed to the block parser.
 * @param blocks - Raw church blocks extracted from the area page.
 * @param existingChurches - Dedup set; newly created churches are appended.
 * @param stats - Running counters, mutated in place.
 * @param dryRun - When true, parse and log only.
 * @param resumeFrom - Absolute index below which blocks are skipped.
 * @param jobId - Tracked job to receive progress updates, or null.
 * @param globalProcessed - Number of blocks handled by previous areas.
 * @param globalTotal - Total number of blocks across all areas.
 * @returns `globalProcessed` advanced by the number of blocks in this area.
 */
async function importAreaBlocks(
  areaName: string,
  config: { url: string; defaultCountry: string },
  blocks: string[],
  existingChurches: ExistingChurch[],
  stats: ImportStats,
  dryRun: boolean,
  resumeFrom: number,
  jobId: string | null,
  globalProcessed: number,
  globalTotal: number,
): Promise<number> {
  console.log(`\nProcessing ${areaName}: ${blocks.length} churches`);

  const startTime = Date.now();

  for (let i = 0; i < blocks.length; i++) {
    const absoluteIndex = globalProcessed + i;
    if (absoluteIndex < resumeFrom) continue;

    const church = parseChurchBlock(blocks[i], config.defaultCountry);
    if (church) {
      stats.churchesParsed++;
    } else {
      stats.errors++;
    }

    if (dryRun) {
      if (church && stats.churchesParsed <= 20) {
        const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
        console.log(` [${areaName}] ${church.name} (${church.country}) — ${church.schedules.length} schedules, coords: ${church.latitude.toFixed(4)}, ${church.longitude.toFixed(4)} [${elapsed}s]`);
      }
      continue;
    }

    if (church) {
      try {
        const candidate = {
          name: church.name,
          lat: church.latitude,
          lng: church.longitude,
          weekdayMassesId: church.churchId,
        };

        const duplicate = findDuplicateChurch(candidate, existingChurches);

        if (duplicate) {
          await updateMatchedChurch(duplicate, church, stats);
        } else {
          await createUnmatchedChurch(church, existingChurches, stats);
        }
      } catch (error) {
        // One bad church must not abort the whole area; record it and move on.
        console.error(` Error processing ${church.name} (${church.churchId}): ${error instanceof Error ? error.message : error}`);
        stats.errors++;
      }
    }

    // Progress logging
    const totalProcessed = absoluteIndex + 1;
    if (totalProcessed % 500 === 0) {
      const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
      console.log(` Progress: ${totalProcessed}/${globalTotal} [${elapsed}s]`);
      await updateJobProgress(jobId, stats, globalTotal, totalProcessed);
    }
  }

  return globalProcessed + blocks.length;
}
|
|
|
|
// ─── Main ───────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Entry point: parse CLI arguments, fetch the selected area pages, import
 * every church block, and print a summary.
 *
 * All work after argument parsing runs inside one try/catch/finally, so any
 * failure (including while starting the job, loading the dedup set or fetching
 * pages) marks the tracked job as failed and still disconnects Prisma. Failure
 * is signalled through `process.exitCode` instead of `process.exit()`, because
 * exiting inside the catch block would prevent the `finally` cleanup from
 * running.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  let jobId: string | null = null;

  try {
    jobId = await createOrResumeJob(args.jobId);

    console.log(`\n${'='.repeat(70)}`);
    console.log('WEEKDAYMASSES.ORG.UK IMPORTER');
    console.log('='.repeat(70));
    console.log(`Mode: ${args.all ? 'All areas' : `Area: ${args.area}`}`);
    console.log(`Dry run: ${args.dryRun ? 'YES' : 'NO'}`);
    if (args.resumeFrom > 0) console.log(`Resume from: ${args.resumeFrom}`);
    console.log(`Time: ${new Date().toISOString()}`);
    console.log('='.repeat(70));

    const stats: ImportStats = {
      churchesParsed: 0,
      churchesMatched: 0,
      churchesCreated: 0,
      churchesSkipped: 0,
      massSchedulesCreated: 0,
      errors: 0,
    };

    // Determine which areas to import
    const areas: Array<[string, { url: string; defaultCountry: string }]> = [];
    if (args.all) {
      areas.push(...Object.entries(AREA_PAGES));
    } else if (args.area) {
      const config = AREA_PAGES[args.area];
      if (!config) {
        const message = `Unknown area: ${args.area}. Valid: ${Object.keys(AREA_PAGES).join(', ')}`;
        console.error(message);
        // The job was already marked as running; close it out instead of leaving it dangling.
        await completeJob(jobId, message);
        process.exitCode = 1;
        return;
      }
      areas.push([args.area, config]);
    }

    // Load existing churches for deduplication
    if (!args.dryRun) {
      console.log('\nLoading existing churches for deduplication...');
    }
    const existingChurches = args.dryRun ? [] : await loadExistingChurches();
    if (!args.dryRun) {
      console.log(`Loaded ${existingChurches.length} existing churches`);
    }

    // Pre-fetch all area pages to get accurate total count for progress tracking
    console.log('\nFetching area pages...');
    const fetchedAreas: Array<{ name: string; config: { url: string; defaultCountry: string }; blocks: string[] }> = [];
    let globalTotal = 0;
    for (const [areaName, config] of areas) {
      console.log(` Fetching ${areaName}: ${SITE_BASE}${config.url}`);
      const html = await fetchPage(`${SITE_BASE}${config.url}`);
      if (!html) {
        console.error(` Failed to fetch ${areaName} page`);
        continue;
      }
      console.log(` Page size: ${(html.length / 1024 / 1024).toFixed(1)} MB`);
      const blocks = extractChurchBlocks(html);
      console.log(` Found ${blocks.length} church blocks`);
      globalTotal += blocks.length;
      fetchedAreas.push({ name: areaName, config, blocks });
    }
    console.log(`\nTotal churches across all areas: ${globalTotal}`);

    let globalProcessed = 0;
    for (const { name: areaName, config, blocks } of fetchedAreas) {
      globalProcessed = await importAreaBlocks(
        areaName, config, blocks, existingChurches, stats,
        args.dryRun, args.resumeFrom, jobId,
        globalProcessed, globalTotal,
      );
    }

    // Print summary
    console.log(`\n${'='.repeat(70)}`);
    console.log(`WEEKDAYMASSES IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
    console.log('='.repeat(70));
    console.log(`Churches parsed: ${stats.churchesParsed}`);
    if (!args.dryRun) {
      console.log(`Churches matched: ${stats.churchesMatched}`);
      console.log(`Churches created: ${stats.churchesCreated}`);
      console.log(`Churches skipped: ${stats.churchesSkipped} (duplicates)`);
      console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
    }
    if (stats.errors > 0) {
      console.log(`Errors: ${stats.errors}`);
    }
    console.log('='.repeat(70));

    await completeJob(jobId);
  } catch (error) {
    console.error('Fatal error:', error);
    await completeJob(jobId, String(error));
    // Set the exit code rather than calling process.exit(1) so `finally` still runs.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
  }
}
|
|
|
|
// Run the importer; a rejection escaping main() is reported and turned into a
// non-zero exit status instead of surfacing as an unhandled promise rejection.
main().catch((error: unknown) => {
  console.error('Unhandled error:', error);
  process.exit(1);
});
|