#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from msze.info (Poland)
 *
 * msze.info is a Polish directory of Catholic parishes with mass schedules.
 * The site uses numbered sitemaps (Churches1.xml through Churches11.xml)
 * with ~500 URLs each, containing both /kosciol/{id} (church pages) and
 * /msze-online/{slug} (livestream pages).
 *
 * Import strategy:
 * 1. Fetch all 11 sitemaps → extract /kosciol/{id} URLs (skip /msze-online/)
 * 2. For each church: fetch HTML, parse name/address/phone/website/schedule
 * 3. Extract coordinates from embedded tomtom_codeAddress() JS call
 * 4. Match against existing PL churches, upsert
 *
 * Usage:
 *   npx tsx scripts/import-msze-info.ts --all
 *   npx tsx scripts/import-msze-info.ts --all --dry-run
 *   npx tsx scripts/import-msze-info.ts --all --resume-from 500
 *   npx tsx scripts/import-msze-info.ts --all --job-id {uuid}
 */

import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Mask the password portion of the URL before logging it.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
const pool = new Pool({
  connectionString: dbUrl,
  // NOTE(review): certificate verification is switched off whenever the URL
  // contains "neon" — confirm this is still required for that host.
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';

// ─── Constants ───────────────────────────────────────────────────────────────

const SITE_BASE = 'https://www.msze.info';
const SITEMAP_COUNT = 11;
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
const REQUEST_DELAY_MS = 1500;

// ─── Types ───────────────────────────────────────────────────────────────────

interface ParsedChurch {
  name: string;
  address: string | null;
  city: string | null;
  zip: string | null;
  phone: string | null;
  website: string | null;
  email: string | null;
  /** 0 when the page carried no usable coordinates. */
  latitude: number;
  /** 0 when the page carried no usable coordinates. */
  longitude: number;
}

interface ParsedSchedule {
  dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
  time: string; // "05:00", "18:30"
}

interface ImportStats {
  churchesFound: number;
  churchesMatched: number;
  churchesCreated: number;
  churchesSkipped: number;
  schedulesProcessed: number;
  massSchedulesCreated: number;
  errors: number;
}

interface CLIArgs {
  all: boolean;
  dryRun: boolean;
  resumeFrom?: number;
  jobId?: string;
}

// ─── HTTP Client ─────────────────────────────────────────────────────────────

/** Number of HTTP requests issued so far; also gates the inter-request delay. */
let requestCount = 0;

/** Resolves after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Fetches `url` as text, waiting `delayMs` before every request except the
 * first one of the run.
 *
 * @returns The response body, or `null` on a non-2xx status or a network
 *          error (both are logged, never thrown).
 */
async function fetchPage(url: string, delayMs: number = REQUEST_DELAY_MS): Promise<string | null> {
  if (requestCount > 0) {
    await delay(delayMs);
  }
  requestCount++;

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
    });
    if (!response.ok) {
      console.error(` HTTP ${response.status} for ${url}`);
      return null;
    }
    return await response.text();
  } catch (error) {
    console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`);
    return null;
  }
}

// ─── Sitemap Parser ──────────────────────────────────────────────────────────

/**
 * Downloads Churches1.xml … Churches{SITEMAP_COUNT}.xml and collects the
 * numeric IDs of all /kosciol/{id} entries.
 *
 * A sitemap that cannot be fetched is logged and skipped.
 *
 * @returns Unique church IDs sorted in ascending numeric order.
 */
async function fetchChurchUrlsFromSitemaps(): Promise<string[]> {
  const allIds: string[] = [];
  const seen = new Set<string>();

  for (let i = 1; i <= SITEMAP_COUNT; i++) {
    const sitemapUrl = `${SITE_BASE}/sitemap/Churches${i}.xml`;
    console.log(` Fetching ${sitemapUrl}...`);
    const xml = await fetchPage(sitemapUrl);
    if (!xml) {
      console.error(` Failed to fetch ${sitemapUrl}`);
      continue;
    }

    // Extract /kosciol/{id} URLs, skip /msze-online/
    const locRegex = /https?:\/\/(?:www\.)?msze\.info\/kosciol\/(\d+)<\/loc>/g;
    for (const match of xml.matchAll(locRegex)) {
      const id = match[1];
      if (!seen.has(id)) {
        seen.add(id);
        allIds.push(id);
      }
    }
  }

  // Sort numerically for deterministic order
  allIds.sort((a, b) => parseInt(a, 10) - parseInt(b, 10));
  console.log(`Found ${allIds.length} unique church IDs from ${SITEMAP_COUNT} sitemaps`);
  return allIds;
}

// ─── HTML Parsers ────────────────────────────────────────────────────────────

/** Named entities decoded by {@link decodeHtmlEntities}; others are left untouched. */
const NAMED_ENTITIES: Record<string, string> = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  nbsp: ' ',
};

/**
 * Decodes numeric (`&#243;`, `&#xF3;`) and a handful of named HTML entities.
 * Unknown or out-of-range entities are returned unchanged.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body[1] === 'x' || body[1] === 'X';
      const code = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      // String.fromCodePoint throws on values above U+10FFFF, so guard first.
      return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : entity;
    }
    return NAMED_ENTITIES[body.toLowerCase()] ?? entity;
  });
}

/**
 * Extracts church details from a /kosciol/{id} page.
 *
 * Fields that cannot be found come back as `null` (or `''` for `name`, and
 * `0`/`0` for the coordinates). `email` is always `null`.
 *
 * NOTE(review): the HTML-tag portions of the regexes in this function were
 * reconstructed from a damaged copy of the file — verify each pattern against
 * a live msze.info page before relying on it.
 */
function parseChurchPage(html: string): ParsedChurch {
  // Name: from <h1>Church Name, City</h1>
  const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  let name = '';
  let cityFromH1: string | null = null;
  if (h1Match) {
    const raw = decodeHtmlEntities(h1Match[1].replace(/<[^>]+>/g, '')).trim();
    // Split "Church Name, City" — city is the last comma-separated part
    const lastComma = raw.lastIndexOf(',');
    if (lastComma > 0) {
      name = raw.substring(0, lastComma).trim();
      cityFromH1 = raw.substring(lastComma + 1).trim();
    } else {
      name = raw;
    }
  }

  // Address: look for "Adres:" or address-like patterns
  // Pattern: <span>Adres:</span> <strong>Street, City</strong>
  let address: string | null = null;
  let city: string | null = cityFromH1;
  let zip: string | null = null;

  // NOTE(review): the terminator alternatives after </strong> are a guess.
  const addressMatch = html.match(
    /Adres:<\/span>\s*(?:<strong>)?([\s\S]*?)(?:<\/strong>|<br\b|<\/(?:p|div|li|td)>)/i
  );
  if (addressMatch) {
    address =
      decodeHtmlEntities(addressMatch[1].replace(/<[^>]+>/g, ''))
        .replace(/\s+/g, ' ')
        .trim() || null;
  }

  // Also try the tomtom_codeAddress first argument as fallback address
  if (!address) {
    const tomtomAddrMatch = html.match(/tomtom_codeAddress\s*\(\s*'([^']+)'/);
    if (tomtomAddrMatch) {
      address = tomtomAddrMatch[1].trim() || null;
    }
  }

  if (address) {
    // Extract Polish postal code (XX-XXX format)
    const zipMatch = address.match(/\b(\d{2}-\d{3})\b/);
    if (zipMatch) {
      zip = zipMatch[1];
    }

    // Extract city from address if not already from h1
    if (!city) {
      // City is typically the last part after comma
      const parts = address.split(',');
      if (parts.length > 1) {
        city = parts[parts.length - 1].replace(/\d{2}-\d{3}/, '').trim() || null;
      }
    }
  }

  // Coordinates: from tomtom_codeAddress('addr', zoom, 'name', null, lat, lng)
  // The address and name arguments are matched as whole quoted literals so a
  // comma inside them ("Street, City") does not break argument counting; an
  // unquoted comma-free argument is still accepted.
  let latitude = 0;
  let longitude = 0;
  const coordMatch = html.match(
    /tomtom_codeAddress\s*\(\s*(?:'(?:[^'\\]|\\.)*'|[^,]+)\s*,\s*\d+\s*,\s*(?:'(?:[^'\\]|\\.)*'|[^,]+)\s*,\s*(?:null|'[^']*')\s*,\s*(-?[\d.]+)\s*,\s*(-?[\d.]+)\s*\)/
  );
  if (coordMatch) {
    const lat = parseFloat(coordMatch[1]);
    const lng = parseFloat(coordMatch[2]);
    if (!isNaN(lat) && !isNaN(lng) && lat !== 0 && lng !== 0) {
      latitude = lat;
      longitude = lng;
    }
  }

  // Phone: <a href="tel:...">
  // NOTE(review): reconstructed pattern and post-processing.
  let phone: string | null = null;
  const phoneMatch = html.match(/<a[^>]+href="tel:([^"]+)"/i);
  if (phoneMatch) {
    phone = phoneMatch[1].trim() || null;
  }

  // Website: <a href="...">Witryna</a>
  // NOTE(review): the part of this pattern before `[^>]*>` is reconstructed.
  let website: string | null = null;
  const websiteMatch = html.match(/<a[^>]+href="([^"]+)"[^>]*>[^<]*Witryna/i);
  if (websiteMatch) {
    website = websiteMatch[1].trim() || null;
  }

  // Also try: link text that looks like a URL (www.xxx)
  if (!website) {
    const wwwMatch = html.match(/<a[^>]+href="([^"]+)"[^>]*>www\.[^<]+<\/a>/i);
    if (wwwMatch) {
      website = wwwMatch[1].trim() || null;
    }
  }

  // Email: not reliably available (Cloudflare-protected)
  const email: string | null = null;

  return { name, address, city, zip, phone, website, email, latitude, longitude };
}

/**
 * Extracts mass times from the h2–h4 sections whose heading starts with
 * "MSZE", expanding each section to the weekdays its heading names.
 *
 * @returns One entry per unique (dayOfWeek, time) pair, in document order.
 */
function parseMassSchedule(html: string): ParsedSchedule[] {
  const schedules: ParsedSchedule[] = [];
  const seen = new Set<string>();

  // Find mass schedule sections by h2/h3 headings containing "MSZE"
  // Pattern: <h2>MSZE NIEDZIELE I ŚWIĘTA - Church Name</h2> followed by "godz. ..."
  // Pattern: <h2>MSZE DNI POWSZEDNIE - Church Name</h2> followed by "godz. ..."
  // NOTE(review): the opening-tag part and the lookahead of this regex are
  // reconstructed; each section is assumed to run until the next h2–h4 or EOF.
  const sectionRegex = /<h[2-4][^>]*>([\s\S]*?)<\/h[2-4]>([\s\S]*?)(?=<h[2-4]|$)/gi;

  for (const sectionMatch of html.matchAll(sectionRegex)) {
    const heading = sectionMatch[1].replace(/<[^>]+>/g, '').trim().toUpperCase();
    const content = sectionMatch[2];

    // Only process mass schedule headings (starts with "MSZE")
    if (!heading.startsWith('MSZE')) continue;

    // Determine which days this section covers
    const days = resolvePolishDays(heading);
    if (days.length === 0) continue;

    // Extract times from "godz." patterns
    const times = extractTimes(content);

    for (const day of days) {
      for (const time of times) {
        const key = `${day}:${time}`;
        if (seen.has(key)) continue;
        seen.add(key);
        schedules.push({ dayOfWeek: day, time });
      }
    }
  }

  return schedules;
}

/**
 * Maps an upper-cased Polish section heading to day-of-week numbers
 * (0 = Sunday … 6 = Saturday). Returns `[]` when no day keyword is present.
 *
 * Checks run in priority order: Sunday, then "weekdays" (Mon–Sat), then
 * individual day names.
 */
function resolvePolishDays(heading: string): number[] {
  const h = heading; // already uppercased by caller

  // "NIEDZIELE I ŚWIĘTA" or just "NIEDZIEL" → Sunday
  if (h.includes('NIEDZIEL')) return [0];

  // "DNI POWSZEDNIE" → Weekdays (Mon-Sat)
  if (h.includes('POWSZEDNI')) return [1, 2, 3, 4, 5, 6];

  // Individual day names (rare but possible); stems cover inflected forms,
  // with unaccented spellings accepted where the original list had them.
  if (h.includes('PONIEDZIA')) return [1]; // poniedziałek
  if (h.includes('WTOREK') || h.includes('WTORK')) return [2];
  if (h.includes('ŚROD') || h.includes('SRODA')) return [3];
  if (h.includes('CZWART')) return [4];
  if (h.includes('PIĄT') || h.includes('PIATEK')) return [5];
  if (h.includes('SOBOT')) return [6];

  return [];
}

/**
 * Collects H:MM / HH:MM times that follow a "godz." marker, up to the next
 * opening <p>, <div>, <br> or <h2>–<h4> tag.
 *
 * @returns Zero-padded "HH:MM" strings (duplicates are not removed here);
 *          values outside 00:00–23:59 are dropped.
 */
function extractTimes(text: string): string[] {
  const times: string[] = [];

  // Only look at text near "godz." patterns
  // Handles: "godz. 6:30, 8:00, 9:30" and "godz. 7:00"
  const godzSections = text.split(/godz\.\s*/i);

  // Index 0 is whatever precedes the first "godz." and is ignored.
  for (const chunk of godzSections.slice(1)) {
    // Take text until the next section break (paragraph, div, heading)
    const section = chunk.split(/<(?:p|div|br\s*\/?>|h[2-4])/i)[0];

    for (const match of section.matchAll(/(\d{1,2}):(\d{2})/g)) {
      const hours = parseInt(match[1], 10);
      const mins = parseInt(match[2], 10);
      if (hours <= 23 && mins <= 59) {
        times.push(`${String(hours).padStart(2, '0')}:${String(mins).padStart(2, '0')}`);
      }
    }
  }

  return times;
}

// ─── Database Operations ─────────────────────────────────────────────────────

/** Loads every church with `country: 'PL'`, selecting the fields used for deduplication. */
async function loadExistingPolishChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Polish churches for deduplication...');
  const churches = await prisma.church.findMany({
    where: { country: 'PL' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });
  console.log(`Loaded ${churches.length} existing Polish churches`);
  return churches;
}

/**
 * True when `error` looks like a unique-constraint violation: Prisma error
 * code `P2002`, or a message containing "Unique constraint".
 */
function isUniqueConstraintError(error: unknown): boolean {
  if (typeof error !== 'object' || error === null) return false;
  const { code, message } = error as { code?: unknown; message?: unknown };
  return code === 'P2002' || (typeof message === 'string' && message.includes('Unique constraint'));
}

/**
 * Replaces all mass schedules of one church and stamps `lastScrapedAt`, in a
 * single transaction. Does nothing when `schedules` is empty.
 *
 * Failures are logged and counted in `stats.errors`; they are not rethrown.
 *
 * @param dbChurchId     Database ID of the church row.
 * @param sourceChurchId msze.info ID, used only in the error message.
 */
async function saveMassSchedules(
  dbChurchId: ExistingChurch['id'],
  sourceChurchId: string,
  schedules: ParsedSchedule[],
  stats: ImportStats,
): Promise<void> {
  if (schedules.length === 0) return;

  try {
    await prisma.$transaction(async (tx) => {
      await tx.massSchedule.deleteMany({ where: { churchId: dbChurchId } });
      await tx.massSchedule.createMany({
        data: schedules.map((s) => ({
          churchId: dbChurchId,
          dayOfWeek: s.dayOfWeek,
          time: s.time,
          language: 'Polish',
        })),
      });
      await tx.church.update({
        where: { id: dbChurchId },
        data: { lastScrapedAt: new Date() },
      });
    });
    stats.schedulesProcessed++;
    stats.massSchedulesCreated += schedules.length;
  } catch (error) {
    stats.errors++;
    console.error(` Error saving schedules for ${sourceChurchId}: ${error instanceof Error ? error.message : error}`);
  }
}

// ─── Import Logic ────────────────────────────────────────────────────────────

/**
 * Imports one msze.info church: fetch, parse, deduplicate, then update the
 * matching row or create a new one, and store its mass schedules.
 *
 * @param churchId         msze.info numeric ID (as a string).
 * @param existingChurches In-memory dedup list; newly created churches are
 *                         appended so later pages in the same run can match them.
 * @param dryRun           When true, only `stats` and the log are updated.
 * @param stats            Counters mutated in place.
 * @throws Any database error other than a unique-constraint violation on the
 *         church row itself.
 */
async function processChurch(
  churchId: string,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  stats.churchesFound++;

  const url = `${SITE_BASE}/kosciol/${churchId}`;
  const churchHtml = await fetchPage(url);
  if (!churchHtml) {
    stats.errors++;
    return;
  }

  const parsed = parseChurchPage(churchHtml);
  if (!parsed.name) {
    console.log(` Skipping ${churchId}: no name found`);
    stats.churchesSkipped++;
    return;
  }

  const schedules = parseMassSchedule(churchHtml);

  // Build candidate for dedup
  // NOTE(review): churches without coordinates are passed (and later stored)
  // with lat/lng 0 — confirm findDuplicateChurch treats 0/0 as "unknown".
  const candidate = {
    name: parsed.name,
    lat: parsed.latitude,
    lng: parsed.longitude,
    mszeInfoId: churchId,
  };
  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
      console.log(` [MATCH] "${parsed.name}" → existing "${duplicate.name}" (${duplicate.id})`);
    } else {
      stats.churchesCreated++;
      console.log(` [NEW] "${parsed.name}" (${parsed.city || 'unknown city'})`);
    }
    if (schedules.length > 0) {
      stats.schedulesProcessed++;
      stats.massSchedulesCreated += schedules.length;
    }
    return;
  }

  if (duplicate) {
    // Update existing church — only fill fields that are currently empty.
    stats.churchesMatched++;

    const updateData: Record<string, unknown> = {
      mszeInfoId: churchId,
    };
    if (!duplicate.address && parsed.address) updateData.address = parsed.address;
    if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
    if (!duplicate.website && parsed.website) {
      updateData.website = parsed.website;
      updateData.hasWebsite = true;
    }

    // Update coordinates if existing has none and we have them
    if (duplicate.latitude === 0 && duplicate.longitude === 0 && parsed.latitude !== 0) {
      updateData.latitude = parsed.latitude;
      updateData.longitude = parsed.longitude;
    }

    // Fill city/zip if not set (these are not part of the in-memory record)
    const dbRecord = await prisma.church.findUnique({
      where: { id: duplicate.id },
      select: { city: true, zip: true, email: true },
    });
    if (dbRecord) {
      if (!dbRecord.city && parsed.city) updateData.city = parsed.city;
      if (!dbRecord.zip && parsed.zip) updateData.zip = parsed.zip;
      if (!dbRecord.email && parsed.email) updateData.email = parsed.email;
    }

    try {
      await prisma.church.update({
        where: { id: duplicate.id },
        data: updateData,
      });
    } catch (error) {
      if (isUniqueConstraintError(error)) {
        console.log(` Skipping ${churchId}: unique constraint conflict while updating ${duplicate.id}`);
        stats.churchesSkipped++;
        return;
      }
      throw error;
    }

    // Replace mass schedules
    await saveMassSchedules(duplicate.id, churchId, schedules, stats);
    return;
  }

  // Create new church
  try {
    const newChurch = await prisma.church.create({
      data: {
        name: parsed.name,
        latitude: parsed.latitude,
        longitude: parsed.longitude,
        address: parsed.address,
        zip: parsed.zip,
        city: parsed.city,
        country: 'PL',
        phone: parsed.phone,
        website: parsed.website,
        email: parsed.email,
        hasWebsite: !!parsed.website,
        mszeInfoId: churchId,
        source: 'msze-info',
      },
    });
    stats.churchesCreated++;

    // Add to in-memory array for within-run dedup
    existingChurches.push({
      id: newChurch.id,
      name: parsed.name,
      latitude: parsed.latitude,
      longitude: parsed.longitude,
      osmId: null,
      baiduId: null,
      masstimesId: null,
      orarimesseId: null,
      massSchedulesPhId: null,
      philmassId: null,
      horariosMisasId: null,
      mszeInfoId: churchId,
      weekdayMassesId: null,
      messesInfoId: null,
      bohosluzbyId: null,
      miserendId: null,
      kerknetId: null,
      gottesdienstzeitenId: null,
      source: 'msze-info',
      website: parsed.website,
      phone: parsed.phone,
      address: parsed.address,
    });

    // Create mass schedules (same transactional path as the update branch;
    // it logs and counts its own failures, so a created church is never
    // also counted as skipped).
    await saveMassSchedules(newChurch.id, churchId, schedules, stats);
  } catch (error) {
    if (isUniqueConstraintError(error)) {
      console.log(` Skipping ${churchId}: unique constraint conflict while creating`);
      stats.churchesSkipped++;
      return;
    }
    throw error;
  }
}

// ─── CLI ─────────────────────────────────────────────────────────────────────

/**
 * Parses `process.argv`. Exits the process with status 0 for `--help`, and
 * with status 1 when `--all` is missing or an option value is invalid.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = args[++i];
        // Reject a missing value or anything that is not a plain non-negative
        // integer instead of silently ending up with NaN.
        if (raw === undefined || !/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from requires a non-negative integer (got ${raw === undefined ? 'nothing' : `"${raw}"`})`);
          process.exit(1);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--job-id': {
        const raw = args[++i];
        if (raw === undefined || raw.startsWith('--')) {
          console.error('Error: --job-id requires a value');
          process.exit(1);
        }
        result.jobId = raw;
        break;
      }
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-msze-info.ts [options]

Options:
  --all              Import all churches from sitemaps
  --dry-run          No database writes, just report what would happen
  --resume-from <n>  Skip first N churches
  --job-id <uuid>    Background job tracking ID
  --help, -h         Show this help message

Examples:
  npx tsx scripts/import-msze-info.ts --all --dry-run
  npx tsx scripts/import-msze-info.ts --all
  npx tsx scripts/import-msze-info.ts --all --resume-from 500
`);
        process.exit(0);
        break;
      default:
        // Unknown options were previously ignored without notice; keep
        // running but make the typo visible.
        console.warn(`Warning: ignoring unknown option "${args[i]}"`);
    }
  }

  if (!result.all) {
    console.error('Error: specify --all');
    process.exit(1);
  }

  return result;
}

// ─── Helpers ─────────────────────────────────────────────────────────────────

/** Formats a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs". */
function formatDuration(ms: number): string {
  const seconds = Math.floor(ms / 1000);
  const minutes = Math.floor(seconds / 60);
  const hours = Math.floor(minutes / 60);
  if (hours > 0) return `${hours}h ${minutes % 60}m ${seconds % 60}s`;
  if (minutes > 0) return `${minutes}m ${seconds % 60}s`;
  return `${seconds}s`;
}

// ─── Main ────────────────────────────────────────────────────────────────────

/**
 * Entry point: loads existing churches, collects IDs from the sitemaps,
 * processes each church sequentially, prints a summary and, when `--job-id`
 * was given, records the run's status on the background job row.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  const startTime = Date.now();
  const resumeOffset = args.resumeFrom ?? 0;

  console.log('\n' + '='.repeat(70));
  console.log('MSZE.INFO (POLAND) IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: All churches from sitemaps`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (resumeOffset > 0) console.log(`Resume from: ${resumeOffset}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  // Update background job status if provided
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (error) {
      // Job might not exist yet — carry on, but leave a trace.
      console.warn(`Could not mark job ${args.jobId} as running: ${error instanceof Error ? error.message : error}`);
    }
  }

  const stats: ImportStats = {
    churchesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesProcessed: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  // Load existing Polish churches for dedup
  const existingChurches = await loadExistingPolishChurches();

  // Fetch church IDs from sitemaps
  console.log('Fetching church URLs from sitemaps...');
  const allChurchIds = await fetchChurchUrlsFromSitemaps();
  const totalChurches = allChurchIds.length;

  // Handle --resume-from
  const churchIds = resumeOffset > 0 ? allChurchIds.slice(resumeOffset) : allChurchIds;
  if (resumeOffset > 0) {
    console.log(`Resuming from index ${resumeOffset} (skipping ${totalChurches - churchIds.length} churches)\n`);
  } else {
    console.log(`Processing ${churchIds.length} churches\n`);
  }

  // Process each church
  for (let i = 0; i < churchIds.length; i++) {
    const id = churchIds[i];
    const elapsed = formatDuration(Date.now() - startTime);
    // Print the absolute position in the full list so the number can be fed
    // straight back into --resume-from after an interruption.
    console.log(`[${resumeOffset + i + 1}/${totalChurches}] kosciol/${id} [${elapsed} elapsed]`);

    try {
      await processChurch(id, existingChurches, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing ${id}: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Churches found: ${stats.churchesFound}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules processed: ${stats.schedulesProcessed}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  // Update background job
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch (error) {
      console.warn(`Could not record completion of job ${args.jobId}: ${error instanceof Error ? error.message : error}`);
    }
  }
}

main()
  .catch((error) => {
    console.error('Fatal error:', error);
    // Set the exit code instead of calling process.exit() so the cleanup in
    // .finally() below still runs.
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });