#!/usr/bin/env tsx
|
||
|
|
/**
 * Import Catholic churches and mass schedules from msze.info (Poland)
 *
 * msze.info is a Polish directory of Catholic parishes with mass schedules.
 * The site uses numbered sitemaps (Churches1.xml through Churches11.xml)
 * with ~500 URLs each, containing both /kosciol/{id} (church pages) and
 * /msze-online/{slug} (livestream pages).
 *
 * Import strategy:
 * 1. Fetch all 11 sitemaps → extract /kosciol/{id} URLs (skip /msze-online/)
 * 2. For each church: fetch HTML, parse name/address/phone/website/schedule
 * 3. Extract coordinates from embedded tomtom_codeAddress() JS call
 * 4. Match against existing PL churches, upsert
 *
 * Usage:
 *   npx tsx scripts/import-msze-info.ts --all
 *   npx tsx scripts/import-msze-info.ts --all --dry-run
 *   npx tsx scripts/import-msze-info.ts --all --resume-from 500
 *   npx tsx scripts/import-msze-info.ts --all --job-id {uuid}
 */
|
||
|
|
|
||
|
|
import dotenv from 'dotenv';
|
||
|
|
import path from 'path';
|
||
|
|
|
||
|
|
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||
|
|
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||
|
|
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
|
||
|
|
/**
 * Return `url` with the password part of its authority replaced by `***`.
 *
 * The password is everything between the first `:` after the user name and the
 * last `@` of the authority, so passwords that themselves contain `:` or `@`
 * are masked completely. URLs without a password are returned unchanged.
 */
function maskDbUrl(url: string): string {
  return url.replace(/^([a-z][a-z0-9+.-]*:\/\/[^:\/?#@]*):[^\/?#]*@/i, '$1:***@');
}

// Connection string: environment first, local development database as fallback.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
console.log(`Connecting to database: ${maskDbUrl(dbUrl)}`);

const pool = new Pool({
  connectionString: dbUrl,
  // URLs containing "neon" get SSL without certificate verification; all others use the pg default.
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||
|
|
|
||
|
|
import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||
|
|
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||
|
|
|
||
|
|
// ─── Constants ───────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/** Origin of the msze.info site; all sitemap and church URLs are built from it. */
const SITE_BASE = 'https://www.msze.info';

/** Number of numbered sitemaps to fetch (Churches1.xml … Churches11.xml). */
const SITEMAP_COUNT = 11;

/** User-Agent header sent with every request. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

/** Default pause, in milliseconds, inserted before each request after the first. */
const REQUEST_DELAY_MS = 1500;
|
||
|
|
|
||
|
|
// ─── Types ───────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/** Church details extracted from a single msze.info church page. */
interface ParsedChurch {
  /** Church name taken from the page's <h1>; empty string when no <h1> was found. */
  name: string;
  /** Address text, or null when none could be extracted. */
  address: string | null;
  /** City from the <h1> suffix or, failing that, from the address. */
  city: string | null;
  /** Polish postal code in XX-XXX form, if present in the address. */
  zip: string | null;
  /** Value of the first `tel:` link on the page. */
  phone: string | null;
  /** External website URL, if one was found. */
  website: string | null;
  /** E-mail address (the parser currently always yields null). */
  email: string | null;
  /** Latitude; 0 when no coordinates were found. */
  latitude: number;
  /** Longitude; 0 when no coordinates were found. */
  longitude: number;
}
|
||
|
|
|
||
|
|
/** One mass time on one day of the week. */
interface ParsedSchedule {
  /** Day index: 0=Sun, 1=Mon, ..., 6=Sat. */
  dayOfWeek: number;
  /** Zero-padded 24-hour time such as "05:00" or "18:30". */
  time: string;
}
|
||
|
|
|
||
|
|
/** Running counters for one import run; printed in the final summary. */
interface ImportStats {
  /** Church IDs handed to the per-church processing step. */
  churchesFound: number;
  /** Churches matched to an already-existing record. */
  churchesMatched: number;
  /** Churches inserted as new records (or that would be, in a dry run). */
  churchesCreated: number;
  /** Churches skipped (no name found, or a unique-constraint conflict). */
  churchesSkipped: number;
  /** Churches for which at least one mass time was stored. */
  schedulesProcessed: number;
  /** Total number of individual mass schedule rows stored. */
  massSchedulesCreated: number;
  /** Fetch, save, or processing failures. */
  errors: number;
}
|
||
|
|
|
||
|
|
/** Command-line options accepted by the importer. */
interface CLIArgs {
  /** Set by `--all`: import every church listed in the sitemaps. */
  all: boolean;
  /** Set by `--dry-run`: report what would happen without writing to the database. */
  dryRun: boolean;
  /** Value of `--resume-from`: number of leading churches to skip. */
  resumeFrom?: number;
  /** Value of `--job-id`: background job record to update with status. */
  jobId?: string;
}
|
||
|
|
|
||
|
|
// ─── HTTP Client ─────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
// Number of HTTP requests issued so far; fetchPage uses it to skip the delay before the first request, and main prints it in the summary.
let requestCount = 0;
|
||
|
|
|
||
|
|
/**
 * Pause for a given time.
 *
 * @param ms - Number of milliseconds to wait.
 * @returns A promise that resolves once the timer fires.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||
|
|
|
||
|
|
/**
 * Download a URL as text, pausing before every request except the first.
 *
 * @param url - Absolute URL to fetch.
 * @param delayMs - Pause applied before the request when an earlier request
 *   has already been made.
 * @returns The response body, or null on a non-OK status or a fetch error
 *   (both are logged to stderr).
 */
async function fetchPage(url: string, delayMs: number = REQUEST_DELAY_MS): Promise<string | null> {
  // Throttle: every request after the first waits `delayMs` before going out.
  if (requestCount > 0) {
    await delay(delayMs);
  }
  requestCount += 1;

  const headers = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  try {
    const response = await fetch(url, { headers });
    if (response.ok) {
      return await response.text();
    }
    console.error(` HTTP ${response.status} for ${url}`);
    return null;
  } catch (error) {
    const reason = error instanceof Error ? error.message : error;
    console.error(` Fetch error for ${url}: ${reason}`);
    return null;
  }
}
|
||
|
|
|
||
|
|
// ─── Sitemap Parser ──────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Collect the numeric church IDs listed in the site's numbered sitemaps.
 *
 * Only <loc> entries of the form /kosciol/{id} are taken; other entries
 * (such as /msze-online/ pages) do not match the pattern and are ignored.
 * A sitemap that cannot be fetched is logged and skipped.
 *
 * @returns Unique church IDs as strings, sorted by numeric value.
 */
async function fetchChurchUrlsFromSitemaps(): Promise<string[]> {
  // A Set keeps first-seen order and drops repeats across sitemaps.
  const uniqueIds = new Set<string>();

  for (let index = 1; index <= SITEMAP_COUNT; index += 1) {
    const sitemapUrl = `${SITE_BASE}/sitemap/Churches${index}.xml`;
    console.log(` Fetching ${sitemapUrl}...`);

    const xml = await fetchPage(sitemapUrl);
    if (!xml) {
      console.error(` Failed to fetch ${sitemapUrl}`);
      continue;
    }

    // Extract /kosciol/{id} URLs, skip /msze-online/
    const locPattern = /<loc>https?:\/\/(?:www\.)?msze\.info\/kosciol\/(\d+)<\/loc>/g;
    for (const found of xml.matchAll(locPattern)) {
      uniqueIds.add(found[1]);
    }
  }

  // Sort numerically for deterministic order
  const sortedIds = Array.from(uniqueIds);
  sortedIds.sort((left, right) => parseInt(left) - parseInt(right));

  console.log(`Found ${sortedIds.length} unique church IDs from ${SITEMAP_COUNT} sitemaps`);
  return sortedIds;
}
|
||
|
|
|
||
|
|
// ─── HTML Parsers ────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Parse a msze.info church page into structured church data.
 *
 * Extraction rules:
 * - name / city: text of the first <h1>, split at its last comma
 *   ("Church Name, City"); without a comma the whole text is the name.
 * - address: text following "Adres:</span>", falling back to the first quoted
 *   argument of the embedded `tomtom_codeAddress(...)` call.
 * - zip: first XX-XXX postal code inside the address.
 * - city fallback: last comma-separated part of the address, postal code removed.
 * - coordinates: last two arguments of
 *   `tomtom_codeAddress('addr', zoom, 'name', null, lat, lng)`.
 * - phone: first `tel:` link; website: link labelled "Witryna", else a link
 *   whose text starts with "www.".
 *
 * @param html - Raw HTML of the church page.
 * @returns The parsed church. `name` is '' when no <h1> exists; latitude and
 *   longitude are 0 when no usable coordinates were found; `email` is always null.
 */
function parseChurchPage(html: string): ParsedChurch {
  const stripTags = (fragment: string): string => fragment.replace(/<[^>]+>/g, '');

  // Name: from <h1>Church Name, City</h1>
  let name = '';
  let cityFromH1: string | null = null;
  const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  if (h1Match) {
    const raw = stripTags(h1Match[1]).trim();
    // Split "Church Name, City" — city is the last comma-separated part
    const lastComma = raw.lastIndexOf(',');
    if (lastComma > 0) {
      name = raw.substring(0, lastComma).trim();
      // A trailing comma leaves nothing after it; treat that as "no city"
      // so the address-based fallback below still gets a chance.
      cityFromH1 = raw.substring(lastComma + 1).trim() || null;
    } else {
      name = raw;
    }
  }

  // Address: <span class="highlight">Adres:</span> <strong>Street, City</strong>
  let address: string | null = null;
  let city: string | null = cityFromH1;
  let zip: string | null = null;

  const addressMatch = html.match(/Adres:<\/span>\s*(?:<strong>)?([\s\S]*?)(?:<\/strong>|<br|<\/p)/i);
  if (addressMatch) {
    address = stripTags(addressMatch[1]).replace(/\s+/g, ' ').trim() || null;
  }

  // Fallback address: first argument of the tomtom_codeAddress call
  if (!address) {
    const tomtomAddrMatch = html.match(/tomtom_codeAddress\s*\(\s*'([^']+)'/);
    if (tomtomAddrMatch) {
      address = tomtomAddrMatch[1].trim() || null;
    }
  }

  if (address) {
    // Extract Polish postal code (XX-XXX format)
    const zipMatch = address.match(/\b(\d{2}-\d{3})\b/);
    if (zipMatch) {
      zip = zipMatch[1];
    }

    // Extract city from address if not already from h1
    if (!city) {
      // City is typically the last part after comma
      const parts = address.split(',');
      if (parts.length > 1) {
        city = parts[parts.length - 1].replace(/\d{2}-\d{3}/, '').trim() || null;
      }
    }
  }

  // Coordinates: from tomtom_codeAddress('addr', zoom, 'name', null, lat, lng)
  //
  // The address and name arguments are quoted strings that normally contain
  // commas ("Street, City"), so they must be matched as whole quoted strings
  // rather than as "anything up to the next comma". The unquoted form is kept
  // as a last alternative so any call the previous pattern accepted still matches.
  const stringArg = String.raw`(?:'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|[^,]+?)`;
  const coordPattern = new RegExp(
    String.raw`tomtom_codeAddress\s*\(\s*${stringArg}\s*,\s*\d+\s*,\s*${stringArg}\s*,\s*(?:null|'[^']*')\s*,\s*(-?[\d.]+)\s*,\s*(-?[\d.]+)\s*\)`,
  );

  let latitude = 0;
  let longitude = 0;
  const coordMatch = html.match(coordPattern);
  if (coordMatch) {
    const lat = parseFloat(coordMatch[1]);
    const lng = parseFloat(coordMatch[2]);
    // A zero in either position is treated as "no coordinates".
    if (!isNaN(lat) && !isNaN(lng) && lat !== 0 && lng !== 0) {
      latitude = lat;
      longitude = lng;
    }
  }

  // Phone: <a href="tel:...">
  let phone: string | null = null;
  const phoneMatch = html.match(/<a\s+href="tel:([^"]+)"/i);
  if (phoneMatch) {
    phone = phoneMatch[1].trim() || null;
  }

  // Website: look for external link near "Witryna" text
  let website: string | null = null;
  const websiteMatch = html.match(/<a\s+href="(https?:\/\/[^"]+)"[^>]*>[^<]*Witryna/i);
  if (websiteMatch) {
    website = websiteMatch[1].trim() || null;
  }
  // Also try: link text that looks like a URL (www.xxx)
  if (!website) {
    const wwwMatch = html.match(/<a\s+href="(https?:\/\/[^"]+)"[^>]*>www\.[^<]+<\/a>/i);
    if (wwwMatch) {
      website = wwwMatch[1].trim() || null;
    }
  }

  // Email: not reliably available (Cloudflare-protected)
  const email: string | null = null;

  return { name, address, city, zip, phone, website, email, latitude, longitude };
}
|
||
|
|
|
||
|
|
/**
 * Extract mass times from the schedule sections of a church page.
 *
 * A section is an <h2>–<h4> heading plus everything up to the next such
 * heading, a <footer>, a <script>, or the end of the document. Only headings
 * whose text starts with "MSZE" are used, for example:
 *   <h2>MSZE NIEDZIELE I ŚWIĘTA - Church Name</h2> followed by "godz. ..."
 *   <h3>MSZE DNI POWSZEDNIE - Church Name</h3> followed by "godz. ..."
 *
 * @param html - Raw HTML of the church page.
 * @returns One entry per unique (day, time) pair, in discovery order.
 */
function parseMassSchedule(html: string): ParsedSchedule[] {
  const result: ParsedSchedule[] = [];
  const emitted = new Set<string>();

  const sectionPattern = /<h[2-4][^>]*>([\s\S]*?)<\/h[2-4]>([\s\S]*?)(?=<h[2-4]|<footer|<script|$)/gi;

  for (const section of html.matchAll(sectionPattern)) {
    const headingText = section[1].replace(/<[^>]+>/g, '').trim().toUpperCase();
    const body = section[2];

    // Only process mass schedule headings (starts with "MSZE")
    if (!headingText.startsWith('MSZE')) continue;

    // Determine which days this section covers
    const days = resolvePolishDays(headingText);
    if (days.length === 0) continue;

    // Extract times from "godz." patterns
    const times = extractTimes(body);

    days.forEach((dayOfWeek) => {
      times.forEach((time) => {
        const key = `${dayOfWeek}:${time}`;
        if (!emitted.has(key)) {
          emitted.add(key);
          result.push({ dayOfWeek, time });
        }
      });
    });
  }

  return result;
}
|
||
|
|
|
||
|
|
/**
 * Map an upper-cased Polish schedule heading to day-of-week indices.
 *
 * Rules are tried top to bottom and the first one with a matching fragment
 * wins, so a heading naming Sundays takes precedence over everything else.
 *
 * @param heading - Heading text, already upper-cased by the caller.
 * @returns Day indices (0=Sun … 6=Sat), or an empty array if nothing matched.
 */
function resolvePolishDays(heading: string): number[] {
  const rules: ReadonlyArray<{ fragments: readonly string[]; days: readonly number[] }> = [
    // "NIEDZIELE I ŚWIĘTA" or just "NIEDZIEL" → Sunday
    { fragments: ['NIEDZIEL'], days: [0] },
    // "DNI POWSZEDNIE" → Weekdays (Mon-Sat)
    { fragments: ['DNI POWSZEDNIE', 'POWSZEDNI'], days: [1, 2, 3, 4, 5, 6] },
    // Individual day names (rare but possible)
    { fragments: ['PONIEDZIA'], days: [1] }, // poniedziałek
    { fragments: ['WTOREK', 'WTORK'], days: [2] },
    { fragments: ['ŚRODA', 'SRODA', 'ŚROD'], days: [3] },
    { fragments: ['CZWARTEK', 'CZWART'], days: [4] },
    { fragments: ['PIĄTEK', 'PIATEK', 'PIĄT'], days: [5] },
    { fragments: ['SOBOT'], days: [6] },
  ];

  const hit = rules.find((rule) => rule.fragments.some((fragment) => heading.includes(fragment)));
  return hit ? [...hit.days] : [];
}
|
||
|
|
|
||
|
|
/**
 * Collect HH:MM times that follow a "godz." marker.
 *
 * Handles "godz. 6:30, 8:00, 9:30" as well as "godz. 7:00". For every marker,
 * only the text up to the next <p, <div, <br> or <h2>–<h4> tag is scanned.
 * Times outside 00:00–23:59 are dropped. Duplicates are not removed here.
 *
 * @param text - HTML fragment belonging to one schedule section.
 * @returns Zero-padded "HH:MM" strings in order of appearance.
 */
function extractTimes(text: string): string[] {
  const pad = (value: number): string => String(value).padStart(2, '0');
  const isValid = (hours: number, mins: number): boolean =>
    hours >= 0 && hours <= 23 && mins >= 0 && mins <= 59;

  // Everything before the first "godz." is irrelevant, hence the dropped head.
  const [, ...afterMarkers] = text.split(/godz\.\s*/i);

  const collected: string[] = [];
  for (const chunk of afterMarkers) {
    // Take text until the next section break (paragraph, div, heading)
    const [untilBreak] = chunk.split(/<(?:p|div|br\s*\/?>|h[2-4])/i);

    for (const found of untilBreak.matchAll(/(\d{1,2}):(\d{2})/g)) {
      const hours = parseInt(found[1]);
      const mins = parseInt(found[2]);
      if (isValid(hours, mins)) {
        collected.push(`${pad(hours)}:${pad(mins)}`);
      }
    }
  }

  return collected;
}
|
||
|
|
|
||
|
|
// ─── Database Operations ─────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Read every church with country 'PL' from the database, selecting only the
 * columns passed on to the duplicate matcher (name, coordinates, per-source
 * IDs, source, website, phone, address).
 *
 * @returns The selected rows; their count is logged.
 */
async function loadExistingPolishChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Polish churches for deduplication...');

  const polishChurches = await prisma.church.findMany({
    where: { country: 'PL' },
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });

  console.log(`Loaded ${polishChurches.length} existing Polish churches`);
  return polishChurches;
}
|
||
|
|
|
||
|
|
// ─── Import Logic ────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Tell whether `error` represents a unique-constraint violation.
 *
 * Checks the Prisma error code `P2002` first and falls back to the message
 * text, so both structured and plain errors are recognised.
 */
function isUniqueConstraintViolation(error: unknown): boolean {
  if (typeof error !== 'object' || error === null) return false;
  if ((error as { code?: unknown }).code === 'P2002') return true;
  return error instanceof Error && error.message.includes('Unique constraint');
}

/**
 * Fetch, parse and store a single msze.info church.
 *
 * Flow:
 * 1. Download /kosciol/{churchId}; a failed download counts as an error.
 * 2. Parse church data and mass schedule; a page without a name is skipped.
 * 3. Look for a duplicate among `existingChurches`.
 * 4. Dry run: only update counters and log [MATCH] / [NEW].
 * 5. Duplicate found: set `mszeInfoId`, fill in fields that are currently
 *    empty, and, if any mass times were parsed, replace the church's mass
 *    schedules in one transaction.
 * 6. No duplicate: create the church, append it to `existingChurches` so later
 *    churches in the same run can match it, then create its mass schedules.
 *
 * @param churchId - Numeric msze.info church ID (as a string).
 * @param existingChurches - In-memory dedup list; mutated when a church is created.
 * @param dryRun - When true, nothing is written to the database.
 * @param stats - Counters updated in place.
 * @throws Re-throws database errors other than unique-constraint violations
 *   from the church update/create steps.
 */
async function processChurch(
  churchId: string,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  stats.churchesFound++;

  const url = `${SITE_BASE}/kosciol/${churchId}`;
  const churchHtml = await fetchPage(url);
  if (!churchHtml) {
    stats.errors++;
    return;
  }

  const parsed = parseChurchPage(churchHtml);
  if (!parsed.name) {
    console.log(` Skipping ${churchId}: no name found`);
    stats.churchesSkipped++;
    return;
  }

  const schedules = parseMassSchedule(churchHtml);

  // Build candidate for dedup
  const candidate = {
    name: parsed.name,
    lat: parsed.latitude,
    lng: parsed.longitude,
    mszeInfoId: churchId,
  };

  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
      console.log(` [MATCH] "${parsed.name}" → existing "${duplicate.name}" (${duplicate.id})`);
    } else {
      stats.churchesCreated++;
      console.log(` [NEW] "${parsed.name}" (${parsed.city || 'unknown city'})`);
    }
    if (schedules.length > 0) {
      stats.schedulesProcessed++;
      stats.massSchedulesCreated += schedules.length;
    }
    return;
  }

  if (duplicate) {
    // Update existing church
    stats.churchesMatched++;
    const updateData: Record<string, unknown> = {
      mszeInfoId: churchId,
    };

    // Only fill gaps; values already present on the existing record win.
    if (!duplicate.address && parsed.address) updateData.address = parsed.address;
    if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
    if (!duplicate.website && parsed.website) {
      updateData.website = parsed.website;
      updateData.hasWebsite = true;
    }

    // Update coordinates if existing has none and we have them
    if (duplicate.latitude === 0 && duplicate.longitude === 0 && parsed.latitude !== 0) {
      updateData.latitude = parsed.latitude;
      updateData.longitude = parsed.longitude;
    }

    // Fill city/zip/email if not set (these columns are not part of the dedup list)
    const dbRecord = await prisma.church.findUnique({
      where: { id: duplicate.id },
      select: { city: true, zip: true, email: true },
    });
    if (dbRecord && !dbRecord.city && parsed.city) updateData.city = parsed.city;
    if (dbRecord && !dbRecord.zip && parsed.zip) updateData.zip = parsed.zip;
    if (dbRecord && !dbRecord.email && parsed.email) updateData.email = parsed.email;

    try {
      await prisma.church.update({
        where: { id: duplicate.id },
        data: updateData,
      });
    } catch (error) {
      if (isUniqueConstraintViolation(error)) {
        console.log(` Skipping ${churchId}: unique constraint conflict while updating ${duplicate.id}`);
        stats.churchesSkipped++;
        return;
      }
      throw error;
    }

    // Replace mass schedules
    if (schedules.length > 0) {
      try {
        // Delete + insert + timestamp in one transaction so a failure leaves
        // the previous schedules in place.
        await prisma.$transaction(async (tx) => {
          await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
          await tx.massSchedule.createMany({
            data: schedules.map((s) => ({
              churchId: duplicate.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              language: 'Polish',
            })),
          });
          await tx.church.update({
            where: { id: duplicate.id },
            data: { lastScrapedAt: new Date() },
          });
        });
        stats.schedulesProcessed++;
        stats.massSchedulesCreated += schedules.length;
      } catch (error) {
        stats.errors++;
        console.error(` Error saving schedules for ${churchId}: ${error instanceof Error ? error.message : error}`);
      }
    }
  } else {
    // Create new church
    try {
      const newChurch = await prisma.church.create({
        data: {
          name: parsed.name,
          latitude: parsed.latitude,
          longitude: parsed.longitude,
          address: parsed.address,
          zip: parsed.zip,
          city: parsed.city,
          country: 'PL',
          phone: parsed.phone,
          website: parsed.website,
          email: parsed.email,
          hasWebsite: !!parsed.website,
          mszeInfoId: churchId,
          source: 'msze-info',
        },
      });
      stats.churchesCreated++;

      // Add to in-memory array for within-run dedup
      existingChurches.push({
        id: newChurch.id,
        name: parsed.name,
        latitude: parsed.latitude,
        longitude: parsed.longitude,
        osmId: null,
        baiduId: null,
        masstimesId: null,
        orarimesseId: null,
        massSchedulesPhId: null,
        philmassId: null,
        horariosMisasId: null,
        mszeInfoId: churchId,
        weekdayMassesId: null,
        messesInfoId: null,
        bohosluzbyId: null,
        miserendId: null,
        kerknetId: null,
        gottesdienstzeitenId: null,
        source: 'msze-info',
        website: parsed.website,
        phone: parsed.phone,
        address: parsed.address,
      });

      // Create mass schedules
      if (schedules.length > 0) {
        await prisma.massSchedule.createMany({
          data: schedules.map((s) => ({
            churchId: newChurch.id,
            dayOfWeek: s.dayOfWeek,
            time: s.time,
            language: 'Polish',
          })),
        });
        await prisma.church.update({
          where: { id: newChurch.id },
          data: { lastScrapedAt: new Date() },
        });
        stats.schedulesProcessed++;
        stats.massSchedulesCreated += schedules.length;
      }
    } catch (error) {
      if (isUniqueConstraintViolation(error)) {
        console.log(` Skipping ${churchId}: unique constraint conflict while creating`);
        stats.churchesSkipped++;
        return;
      }
      throw error;
    }
  }
}
|
||
|
|
|
||
|
|
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Parse the command-line arguments of the importer (from `process.argv`).
 *
 * Recognised options: `--all`, `--dry-run`, `--resume-from <n>`,
 * `--job-id <uuid>`, `--help` / `-h`. Unrecognised arguments are ignored.
 *
 * The process exits with code 0 after printing help, and with code 1 when
 * - `--all` is missing,
 * - `--resume-from` or `--job-id` has no value, or
 * - the `--resume-from` value is not a non-negative integer.
 *
 * @returns The parsed options.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
  };

  // Read the value that must follow `flag`; a missing value or another
  // option in its place is a usage error.
  const takeValue = (flag: string, index: number): string => {
    const value = args[index];
    if (value !== undefined && !value.startsWith('--')) return value;
    console.error(`Error: ${flag} requires a value`);
    return process.exit(1);
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = takeValue('--resume-from', ++i);
        // Digits only: rejects negatives (which would make slice() count from
        // the end of the list) and non-numeric input (which would become NaN).
        if (!/^\d+$/.test(raw)) {
          console.error(`Error: --resume-from expects a non-negative integer, got "${raw}"`);
          process.exit(1);
        }
        result.resumeFrom = parseInt(raw, 10);
        break;
      }
      case '--job-id':
        result.jobId = takeValue('--job-id', ++i);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-msze-info.ts [options]

Options:
--all Import all churches from sitemaps
--dry-run No database writes, just report what would happen
--resume-from <n> Skip first N churches
--job-id <uuid> Background job tracking ID
--help, -h Show this help message

Examples:
npx tsx scripts/import-msze-info.ts --all --dry-run
npx tsx scripts/import-msze-info.ts --all
npx tsx scripts/import-msze-info.ts --all --resume-from 500
`);
        process.exit(0);
    }
  }

  if (!result.all) {
    console.error('Error: specify --all');
    process.exit(1);
  }

  return result;
}
|
||
|
|
|
||
|
|
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Render a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs".
 *
 * Larger units appear only when non-zero; fractions of a second are dropped.
 *
 * @param ms - Duration in milliseconds.
 * @returns Human-readable duration string.
 */
function formatDuration(ms: number): string {
  const seconds = Math.floor(ms / 1000);
  const minutes = Math.floor(seconds / 60);
  const hours = Math.floor(minutes / 60);

  const parts: string[] = [];
  if (hours > 0) {
    parts.push(`${hours}h`, `${minutes % 60}m`, `${seconds % 60}s`);
  } else if (minutes > 0) {
    parts.push(`${minutes}m`, `${seconds % 60}s`);
  } else {
    parts.push(`${seconds}s`);
  }
  return parts.join(' ');
}
|
||
|
|
|
||
|
|
// ─── Main ────────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/**
 * Entry point: run a complete import.
 *
 * Steps: parse CLI arguments, print a banner, mark the background job as
 * running (when `--job-id` is given), load existing Polish churches, read the
 * church IDs from the sitemaps, apply `--resume-from`, process each church,
 * print a summary, and record the final job status.
 *
 * A failure while processing one church is logged and counted; it does not
 * stop the run. Failures to update the background job are logged as warnings.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  const startTime = Date.now();

  console.log('\n' + '='.repeat(70));
  console.log('MSZE.INFO (POLAND) IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode: All churches from sitemaps`);
  console.log(`Dry run: ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (args.resumeFrom) console.log(`Resume from: ${args.resumeFrom}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  // Update background job status if provided
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (error) {
      // Job might not exist yet — keep importing, but say so.
      console.warn(` Could not mark job ${args.jobId} as running: ${error instanceof Error ? error.message : error}`);
    }
  }

  const stats: ImportStats = {
    churchesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesProcessed: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  // Load existing Polish churches for dedup
  const existingChurches = await loadExistingPolishChurches();

  // Fetch church IDs from sitemaps
  console.log('Fetching church URLs from sitemaps...');
  let churchIds = await fetchChurchUrlsFromSitemaps();
  const totalChurches = churchIds.length;

  // Handle --resume-from
  let skipped = 0;
  if (args.resumeFrom) {
    churchIds = churchIds.slice(args.resumeFrom);
    skipped = totalChurches - churchIds.length;
    console.log(`Resuming from index ${args.resumeFrom} (skipping ${skipped} churches)\n`);
  } else {
    console.log(`Processing ${churchIds.length} churches\n`);
  }

  // Process each church
  for (let i = 0; i < churchIds.length; i++) {
    const id = churchIds[i];
    const elapsed = formatDuration(Date.now() - startTime);
    // Show the position in the full sorted list, so that after an interruption
    // at "[k/total]" the run can be continued with `--resume-from k-1`.
    console.log(`[${skipped + i + 1}/${totalChurches}] kosciol/${id} [${elapsed} elapsed]`);

    try {
      await processChurch(id, existingChurches, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(` ERROR processing ${id}: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Churches found: ${stats.churchesFound}`);
  console.log(` Matched (existing): ${stats.churchesMatched}`);
  console.log(` Created (new): ${stats.churchesCreated}`);
  console.log(` Skipped: ${stats.churchesSkipped}`);
  console.log(`Schedules processed: ${stats.schedulesProcessed}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors: ${stats.errors}`);
  console.log(`Total time: ${formatDuration(totalTime)}`);
  console.log(`HTTP requests: ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  // Update background job
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch (error) {
      // The import itself is done; a failed status update must not fail the run.
      console.warn(` Could not record result for job ${args.jobId}: ${error instanceof Error ? error.message : error}`);
    }
  }
}
|
||
|
|
|
||
|
|
// Run the importer. A rejected run is reported and ends the process with
// exit code 1; otherwise the Prisma client and the pg pool are closed, in
// that order, once the run has settled.
main()
  .catch((fatal: unknown) => {
    console.error('Fatal error:', fatal);
    process.exit(1);
  })
  .finally(() => prisma.$disconnect().then(() => pool.end()));
|