#!/usr/bin/env tsx
/**
 * Interactive helper to configure a new diocese for scraping
 *
 * Usage:
 *   npx tsx scripts/setup-diocese.ts --url https://bistum-mainz.de/pfarreien --country DE --language de
 *   npx tsx scripts/setup-diocese.ts --url https://diocese-paris.fr/paroisses --country FR --language fr
 *   npx tsx scripts/setup-diocese.ts --list                # List all configured dioceses
 *   npx tsx scripts/setup-diocese.ts --test <diocese-id>   # Test scraping a diocese
 */

import dotenv from 'dotenv';
import path from 'path';
// Load .env before the database pool below reads process.env.DATABASE_URL.
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { DioceseDirectoryScraper, DioceseScrapeConfig } from '../src/scrapers/diocese-directory-scraper';
import readline from 'readline';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

/** Writes a timestamped message to stdout. */
function log(msg: string): void {
  console.log(`[${new Date().toISOString()}] ${msg}`);
}

/** Writes a timestamped, `ERROR:`-prefixed message to stderr. */
function logError(msg: string): void {
  console.error(`[${new Date().toISOString()}] ERROR: ${msg}`);
}

/**
 * Prompts on stdout and reads one line from stdin.
 *
 * @param question - Prompt text shown to the user.
 * @returns The trimmed answer, or an empty string if stdin is closed before
 *   an answer arrives (so callers never wait on a promise that cannot settle).
 */
function ask(question: string): Promise<string> {
  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
  return new Promise<string>((resolve) => {
    rl.once('close', () => resolve(''));
    rl.question(question, (answer) => {
      // Resolve before closing: close() fires the 'close' handler above, and
      // only the first resolve() call has any effect.
      resolve(answer.trim());
      rl.close();
    });
  });
}

/**
 * Returns the value that follows `flag` in `args`.
 *
 * @returns `undefined` when the flag is absent, is the last token, or is
 *   directly followed by another `--option`.
 */
function flagValue(args: readonly string[], flag: string): string | undefined {
  const idx = args.indexOf(flag);
  if (idx === -1) return undefined;
  const value = args[idx + 1];
  if (value === undefined || value.startsWith('--')) return undefined;
  return value;
}

/** Prints command-line usage and examples to stdout. */
function printUsage(): void {
  console.log('Usage:');
  console.log(' npx tsx scripts/setup-diocese.ts --url <url> --country <country> --language <language>');
  console.log(' npx tsx scripts/setup-diocese.ts --list');
  console.log(' npx tsx scripts/setup-diocese.ts --test <diocese-id>');
  console.log('');
  console.log('Examples:');
  console.log(' npx tsx scripts/setup-diocese.ts --url https://bistum-mainz.de/pfarreien --country DE --language de');
  console.log(' npx tsx scripts/setup-diocese.ts --url https://diocese-paris.fr/paroisses --country FR --language fr');
}

/**
 * Prints a fixed-width table of every diocese in the database, ordered by
 * country and then by name.
 */
async function listDioceses(): Promise<void> {
  const dioceses = await prisma.diocese.findMany({
    orderBy: [{ country: 'asc' }, { name: 'asc' }],
  });

  if (dioceses.length === 0) {
    log('No dioceses configured yet.');
    return;
  }

  const rule = '─'.repeat(100);

  console.log('\nConfigured Dioceses:');
  console.log(rule);
  console.log(
    'ID'.padEnd(38) +
      'Name'.padEnd(30) +
      'Country'.padEnd(10) +
      'Active'.padEnd(8) +
      'Churches'.padEnd(10) +
      'Last Scraped'
  );
  console.log(rule);

  for (const d of dioceses) {
    console.log(
      d.id.padEnd(38) +
        // Names are cut to 28 characters so they stay inside the 30-wide column.
        d.name.substring(0, 28).padEnd(30) +
        d.country.padEnd(10) +
        (d.active ? 'Yes' : 'No').padEnd(8) +
        String(d.churchCount).padEnd(10) +
        // Date part only (YYYY-MM-DD) of the ISO timestamp.
        (d.lastScrapedAt ? d.lastScrapedAt.toISOString().split('T')[0] : 'Never')
    );
  }

  console.log(rule);
  console.log(`Total: ${dioceses.length} dioceses`);
}

/**
 * Scrapes the directory of an already-configured diocese with its stored
 * scrape config and prints the first ten parishes found.
 *
 * Logs an error and returns without scraping when the diocese does not exist,
 * has no directory URL, or has no selectors in its scrape config.
 *
 * @param dioceseId - Primary key of the diocese record to test.
 */
async function testDiocese(dioceseId: string): Promise<void> {
  const diocese = await prisma.diocese.findUnique({ where: { id: dioceseId } });
  if (!diocese) {
    logError(`Diocese not found: ${dioceseId}`);
    return;
  }

  if (!diocese.directoryUrl) {
    logError(`Diocese ${diocese.name} has no directory URL`);
    return;
  }

  const config = diocese.scrapeConfig as DioceseScrapeConfig | null;
  if (!config?.selectors) {
    logError(`Diocese ${diocese.name} has no scrape config`);
    return;
  }

  log(`Testing diocese: ${diocese.name}`);
  log(`Directory URL: ${diocese.directoryUrl}`);
  log('');

  // NOTE(review): unlike setupDiocese, init() is not called here; presumably
  // scrapeDirectory() initialises the scraper itself — confirm.
  const scraper = new DioceseDirectoryScraper();
  try {
    const parishes = await scraper.scrapeDirectory(diocese.directoryUrl, config);
    log(`\nDiscovered ${parishes.length} parishes:\n`);

    for (const p of parishes.slice(0, 10)) {
      console.log(` ${p.name}`);
      console.log(` URL: ${p.url}`);
      if (p.address) console.log(` Address: ${p.address}`);
      if (p.city) console.log(` City: ${p.city}`);
      console.log('');
    }

    if (parishes.length > 10) {
      console.log(` ... and ${parishes.length - 10} more`);
    }
  } finally {
    await scraper.close();
  }
}

/**
 * Interactively creates a diocese record.
 *
 * Asks for the diocese name, loads `url` in the scraper's page, prints link
 * and element statistics to help choose selectors, asks for the CSS
 * selectors, shows a five-item test extraction and, once confirmed, saves
 * the diocese together with its scrape config.
 *
 * Logs an error and returns without saving when the URL is not parseable,
 * the name or a required selector is empty, the URL pattern is not a valid
 * regular expression, or a diocese with the same name and country exists.
 *
 * @param url - Directory page listing the parishes of the diocese.
 * @param country - Country value stored on the diocese record.
 * @param language - Language value stored on the diocese record.
 */
async function setupDiocese(url: string, country: string, language: string): Promise<void> {
  // Validate the URL before prompting, so a typo is reported immediately
  // instead of surfacing as a navigation failure later on.
  let website: string;
  try {
    website = new URL(url).origin;
  } catch {
    logError(`Invalid URL: ${url}`);
    return;
  }

  log(`Setting up diocese from: ${url}`);
  log(`Country: ${country}, Language: ${language}`);

  // Ask for diocese name
  const name = await ask('\nDiocese name (e.g. "Bistum Mainz"): ');
  if (!name) {
    logError('Name is required');
    return;
  }

  // Check if already exists
  const existing = await prisma.diocese.findFirst({
    where: { name, country },
  });
  if (existing) {
    logError(`Diocese "${name}" already exists in ${country} (ID: ${existing.id})`);
    return;
  }

  // Probe the page structure
  log('\nProbing page structure...');
  const scraper = new DioceseDirectoryScraper();
  await scraper.init();

  try {
    // NOTE(review): reaches into the scraper for its page object; the cast is
    // kept because the scraper's public API is not visible from this script.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const page = (scraper as any).page;
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

    // Analyze page - count links and common patterns. This callback is
    // evaluated inside the page, so it may only use what it defines itself.
    const analysis = await page.evaluate(() => {
      const links = Array.from(document.querySelectorAll('a'));
      const linkPatterns: Record<string, number> = {};

      for (const link of links) {
        const href = link.href;
        if (!href) continue;

        // Extract pattern from URL path: the last segment becomes "*".
        try {
          const segments = new URL(href).pathname.split('/').filter(Boolean);
          if (segments.length >= 1) {
            // Joining through one array yields "/*" for single-segment paths
            // rather than the malformed "//*".
            const pattern = '/' + segments.slice(0, -1).concat('*').join('/');
            linkPatterns[pattern] = (linkPatterns[pattern] ?? 0) + 1;
          }
        } catch {
          /* ignore hrefs that cannot be parsed as URLs */
        }
      }

      // Find most common list-like elements
      const listSelectors = [
        'ul li',
        'ol li',
        'div.parish',
        'div.item',
        'article',
        'tr',
        '.card',
        '.entry',
        '.listing',
        '.result',
      ];
      const selectorCounts: Record<string, number> = {};
      for (const sel of listSelectors) {
        selectorCounts[sel] = document.querySelectorAll(sel).length;
      }

      return {
        title: document.title,
        totalLinks: links.length,
        // Ten most frequent patterns, most frequent first.
        linkPatterns: Object.entries(linkPatterns)
          .sort(([, a], [, b]) => b - a)
          .slice(0, 10),
        selectorCounts,
        bodyTextLength: document.body?.textContent?.length || 0,
      };
    });

    console.log(`\nPage: ${analysis.title}`);
    console.log(`Total links: ${analysis.totalLinks}`);

    console.log(`\nMost common link patterns:`);
    for (const [pattern, count] of analysis.linkPatterns) {
      console.log(` ${pattern}: ${count} links`);
    }

    console.log(`\nElement counts:`);
    for (const [sel, count] of Object.entries(analysis.selectorCounts as Record<string, number>)) {
      if (count > 0) console.log(` ${sel}: ${count}`);
    }

    // Ask for selectors
    console.log('\nNow configure CSS selectors for this diocese.\n');

    const parishList = await ask('Parish list container selector (e.g. "ul.parishes li", ".parish-item"): ');
    if (!parishList) {
      logError('Parish list container selector is required');
      return;
    }

    const parishLink = await ask('Parish link selector within container (e.g. "a", "a.parish-link"): ');
    if (!parishLink) {
      logError('Parish link selector is required');
      return;
    }

    // `||` (not `??`) is intentional: an empty answer means "not configured".
    const parishName = (await ask('Parish name selector (leave empty to use link text): ')) || undefined;
    const parishAddress = (await ask('Address selector (leave empty if none): ')) || undefined;
    const parishCity = (await ask('City selector (leave empty if none): ')) || undefined;
    const pagination = (await ask('Pagination "next" selector (leave empty if none): ')) || undefined;

    const urlPatternStr = (await ask('URL pattern regex (leave empty for all): ')) || undefined;
    if (urlPatternStr !== undefined) {
      // Reject patterns that do not compile instead of persisting them.
      try {
        new RegExp(urlPatternStr);
      } catch {
        logError(`Invalid URL pattern regex: ${urlPatternStr}`);
        return;
      }
    }

    const waitForSelector = (await ask('Wait for selector (leave empty if not needed): ')) || undefined;

    const scrapeConfig: DioceseScrapeConfig = {
      selectors: {
        parishList,
        parishLink,
        parishName,
        parishAddress,
        parishCity,
        pagination,
      },
      urlPattern: urlPatternStr,
      waitForSelector,
      maxPages: 50,
      scheduleInDirectory: false,
    };

    // Test the config
    console.log('\nTesting selectors...');
    const testResults: Array<{ name: string; url: string }> = await page.$$eval(
      parishList,
      (elements: Element[], linkSel: string) => {
        return elements.slice(0, 5).map((el) => {
          const link = el.querySelector(linkSel);
          return {
            // Prefer the link text; fall back to the container's own text.
            name: link?.textContent?.trim() || el.textContent?.trim()?.substring(0, 80) || '(empty)',
            url: link?.getAttribute('href') || '(no link)',
          };
        });
      },
      parishLink
    );

    console.log(`\nTest extraction (first 5):`);
    if (testResults.length === 0) {
      console.log(' (no elements matched the parish list selector)');
    }
    for (const r of testResults) {
      console.log(` ${r.name}`);
      console.log(` -> ${r.url}`);
    }

    const confirm = (await ask('\nSave this configuration? (yes/no): ')).toLowerCase();
    if (confirm !== 'yes' && confirm !== 'y') {
      log('Cancelled.');
      return;
    }

    // Save to database
    const diocese = await prisma.diocese.create({
      data: {
        name,
        country,
        language,
        website,
        directoryUrl: url,
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        scrapeConfig: scrapeConfig as any,
        active: true,
      },
    });

    log(`\nDiocese saved! ID: ${diocese.id}`);
    log(`Run: npx tsx scripts/scrape-diocese-directory.ts --diocese ${diocese.id} --dry-run`);
  } finally {
    await scraper.close();
  }
}

/**
 * Entry point: dispatches to `--list`, `--test <diocese-id>` or the
 * interactive `--url … --country … [--language …]` setup, printing usage
 * when the required options or their values are missing.
 *
 * Database connections are released in a `finally` block so they are closed
 * on every path, including when a command throws.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);

  try {
    if (args.includes('--list')) {
      await listDioceses();
      return;
    }

    if (args.includes('--test')) {
      const dioceseId = flagValue(args, '--test');
      if (!dioceseId) {
        logError('--test requires a diocese ID');
        printUsage();
        process.exitCode = 1;
        return;
      }
      await testDiocese(dioceseId);
      return;
    }

    const url = flagValue(args, '--url');
    const country = flagValue(args, '--country');
    if (!url || !country) {
      printUsage();
      return;
    }

    // The language defaults to the lower-cased country code.
    const language = flagValue(args, '--language') ?? country.toLowerCase();
    await setupDiocese(url, country, language);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  logError(`Fatal error: ${message}`);
  process.exit(1);
});