Files
ScraperControl/scripts/setup-diocese.ts

329 lines
10 KiB
TypeScript
Raw Permalink Normal View History

#!/usr/bin/env tsx
/**
* Interactive helper to configure a new diocese for scraping
*
* Usage:
* npx tsx scripts/setup-diocese.ts --url https://bistum-mainz.de/pfarreien --country DE --language de
* npx tsx scripts/setup-diocese.ts --url https://diocese-paris.fr/paroisses --country FR --language fr
* npx tsx scripts/setup-diocese.ts --list # List all configured dioceses
* npx tsx scripts/setup-diocese.ts --test <diocese-id> # Test scraping a diocese
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { DioceseDirectoryScraper, DioceseScrapeConfig } from '../src/scrapers/diocese-directory-scraper';
import readline from 'readline';
// PostgreSQL connection pool; the connection string is read from the
// DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
// Prisma driver adapter constructed over the pool above.
const adapter = new PrismaPg(pool);
// Prisma client configured with that adapter; the functions in this script
// use it for their diocese queries.
const prisma = new PrismaClient({ adapter });
/**
 * Prints an informational message to stdout, prefixed with the current
 * ISO-8601 timestamp in square brackets.
 *
 * @param msg - Text to print after the timestamp prefix.
 */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
/**
 * Prints an error message to stderr, prefixed with the current ISO-8601
 * timestamp in square brackets and the literal tag `ERROR:`.
 *
 * @param msg - Description of the failure.
 */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error('[' + stamp + '] ERROR: ' + msg);
}
/**
 * Prompts the user on stdout and reads one line of input from stdin.
 *
 * A fresh readline interface is created for each call and closed as soon as
 * the answer arrives.
 *
 * @param question - Prompt text written before waiting for input.
 * @returns A promise resolving to the entered line with surrounding
 *   whitespace trimmed.
 */
function ask(question: string): Promise<string> {
  const terminal = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  return new Promise<string>((done) => {
    const handleReply = (reply: string): void => {
      terminal.close();
      done(reply.trim());
    };
    terminal.question(question, handleReply);
  });
}
/**
 * Prints a fixed-width table of every diocese stored in the database,
 * ordered by country and then by name.
 *
 * Columns: ID, name (cut to 28 characters), country, active flag, church
 * count, and the date part of the last scrape (or "Never"). When no rows
 * exist, a single log line is written instead of the table.
 */
async function listDioceses() {
  const rows = await prisma.diocese.findMany({
    orderBy: [{ country: 'asc' }, { name: 'asc' }],
  });

  if (rows.length === 0) {
    log('No dioceses configured yet.');
    return;
  }

  const rule = '─'.repeat(100);
  const headerCells = [
    'ID'.padEnd(38),
    'Name'.padEnd(30),
    'Country'.padEnd(10),
    'Active'.padEnd(8),
    'Churches'.padEnd(10),
  ];

  console.log('\nConfigured Dioceses:');
  console.log(rule);
  console.log(headerCells.join('') + 'Last Scraped');
  console.log(rule);

  rows.forEach((row) => {
    const activeLabel = row.active ? 'Yes' : 'No';
    // Keep only the YYYY-MM-DD part of the ISO timestamp.
    const lastScraped = row.lastScrapedAt
      ? row.lastScrapedAt.toISOString().split('T')[0]
      : 'Never';
    const paddedCells = [
      row.id.padEnd(38),
      row.name.substring(0, 28).padEnd(30),
      row.country.padEnd(10),
      activeLabel.padEnd(8),
      String(row.churchCount).padEnd(10),
    ];
    console.log(paddedCells.join('') + lastScraped);
  });

  console.log(rule);
  console.log(`Total: ${rows.length} dioceses`);
}
/**
 * Runs the directory scraper against one stored diocese and prints a preview
 * of what it discovers, without writing anything back to the database.
 *
 * The function logs an error and returns early when the diocese does not
 * exist, has no directory URL, or has no scrape config with selectors.
 * Otherwise it prints up to the first ten parishes (name, URL, and address /
 * city when present) plus a count of the remainder. The scraper is always
 * closed, even when scraping throws.
 *
 * @param dioceseId - Primary key of the diocese row to test.
 */
async function testDiocese(dioceseId: string) {
  const record = await prisma.diocese.findUnique({ where: { id: dioceseId } });
  if (!record) {
    logError(`Diocese not found: ${dioceseId}`);
    return;
  }

  const { directoryUrl } = record;
  if (!directoryUrl) {
    logError(`Diocese ${record.name} has no directory URL`);
    return;
  }

  const scrapeConfig = record.scrapeConfig as DioceseScrapeConfig | null;
  if (!scrapeConfig?.selectors) {
    logError(`Diocese ${record.name} has no scrape config`);
    return;
  }

  log(`Testing diocese: ${record.name}`);
  log(`Directory URL: ${directoryUrl}`);
  log('');

  const previewLimit = 10;
  const scraper = new DioceseDirectoryScraper();
  try {
    const found = await scraper.scrapeDirectory(directoryUrl, scrapeConfig);
    log(`\nDiscovered ${found.length} parishes:\n`);

    found.slice(0, previewLimit).forEach((parish) => {
      console.log(` ${parish.name}`);
      console.log(` URL: ${parish.url}`);
      if (parish.address) {
        console.log(` Address: ${parish.address}`);
      }
      if (parish.city) {
        console.log(` City: ${parish.city}`);
      }
      console.log('');
    });

    if (found.length > previewLimit) {
      console.log(` ... and ${found.length - previewLimit} more`);
    }
  } finally {
    await scraper.close();
  }
}
/**
 * Interactively registers a new diocese for scraping.
 *
 * Steps:
 *  1. Ask for the diocese name and abort if it is empty or if a diocese with
 *     the same name and country already exists.
 *  2. Open the directory page and print a summary (title, link count, most
 *     common link path patterns, counts for a few list-like selectors) to
 *     help the operator choose CSS selectors.
 *  3. Ask for the selectors and optional settings. The two required
 *     selectors must be non-empty, and the URL pattern must compile as a
 *     regular expression; otherwise the setup is aborted before anything is
 *     saved.
 *  4. Show what the selectors extract from the first five matching elements
 *     and ask for confirmation.
 *  5. On "yes"/"y", create the diocese row with the collected scrape config.
 *
 * The scraper is closed on every path once it has been initialised.
 *
 * @param url - Directory page listing the diocese's parishes.
 * @param country - Country code stored on the diocese row.
 * @param language - Language code stored on the diocese row.
 */
async function setupDiocese(url: string, country: string, language: string) {
  log(`Setting up diocese from: ${url}`);
  log(`Country: ${country}, Language: ${language}`);

  // Ask for diocese name
  const name = await ask('\nDiocese name (e.g. "Bistum Mainz"): ');
  if (!name) {
    logError('Name is required');
    return;
  }

  // Check if already exists
  const existing = await prisma.diocese.findFirst({
    where: { name, country },
  });
  if (existing) {
    logError(`Diocese "${name}" already exists in ${country} (ID: ${existing.id})`);
    return;
  }

  // Probe the page structure
  log('\nProbing page structure...');
  const scraper = new DioceseDirectoryScraper();
  await scraper.init();
  try {
    // The scraper's `page` is reached through an `any` cast, so everything
    // obtained from it below is untyped.
    const page = (scraper as any).page;
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

    // Analyze page - count links and common patterns.
    // NOTE: this callback is passed to page.evaluate, so it must stay
    // self-contained and must not reference anything from this module.
    const analysis = await page.evaluate(() => {
      const links = Array.from(document.querySelectorAll('a'));
      const linkPatterns: Record<string, number> = {};
      for (const link of links) {
        const href = link.href;
        if (!href) continue;
        // Extract pattern from URL path: parent directory plus a wildcard
        // for the last segment.
        try {
          const segments = new URL(href).pathname.split('/').filter(Boolean);
          if (segments.length >= 1) {
            // Joining the parent segments together with '*' yields "/*" for
            // top-level paths (previously this produced "//*").
            const pattern = '/' + [...segments.slice(0, -1), '*'].join('/');
            linkPatterns[pattern] = (linkPatterns[pattern] || 0) + 1;
          }
        } catch { /* ignore hrefs that are not parseable URLs */ }
      }

      // Find most common list-like elements
      const listSelectors = [
        'ul li', 'ol li', 'div.parish', 'div.item', 'article',
        'tr', '.card', '.entry', '.listing', '.result',
      ];
      const selectorCounts: Record<string, number> = {};
      for (const sel of listSelectors) {
        selectorCounts[sel] = document.querySelectorAll(sel).length;
      }

      return {
        title: document.title,
        totalLinks: links.length,
        // Ten most frequent patterns, most frequent first.
        linkPatterns: Object.entries(linkPatterns)
          .sort(([, a], [, b]) => b - a)
          .slice(0, 10),
        selectorCounts,
        bodyTextLength: document.body?.textContent?.length || 0,
      };
    });

    console.log(`\nPage: ${analysis.title}`);
    console.log(`Total links: ${analysis.totalLinks}`);
    console.log(`\nMost common link patterns:`);
    for (const [pattern, count] of analysis.linkPatterns) {
      console.log(` ${pattern}: ${count} links`);
    }
    console.log(`\nElement counts:`);
    for (const [sel, count] of Object.entries(analysis.selectorCounts)) {
      if (count > 0) console.log(` ${sel}: ${count}`);
    }

    // Ask for selectors
    console.log('\nNow configure CSS selectors for this diocese.\n');

    // The list and link selectors are mandatory: an empty selector would make
    // the selector test below throw and would store an unusable config.
    const parishList = await ask('Parish list container selector (e.g. "ul.parishes li", ".parish-item"): ');
    if (!parishList) {
      logError('Parish list container selector is required');
      return;
    }
    const parishLink = await ask('Parish link selector within container (e.g. "a", "a.parish-link"): ');
    if (!parishLink) {
      logError('Parish link selector is required');
      return;
    }

    // Optional settings: an empty answer is stored as undefined.
    const parishName = (await ask('Parish name selector (leave empty to use link text): ')) || undefined;
    const parishAddress = (await ask('Address selector (leave empty if none): ')) || undefined;
    const parishCity = (await ask('City selector (leave empty if none): ')) || undefined;
    const pagination = (await ask('Pagination "next" selector (leave empty if none): ')) || undefined;

    const urlPatternStr = (await ask('URL pattern regex (leave empty for all): ')) || undefined;
    if (urlPatternStr !== undefined) {
      // Reject patterns that do not compile so a typo is caught here rather
      // than during a later scrape.
      // NOTE(review): assumes the scraper compiles this with JavaScript's
      // RegExp - confirm against DioceseDirectoryScraper.
      try {
        new RegExp(urlPatternStr);
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        logError(`Invalid URL pattern regex: ${reason}`);
        return;
      }
    }

    const waitForSelector = (await ask('Wait for selector (leave empty if not needed): ')) || undefined;

    const scrapeConfig: DioceseScrapeConfig = {
      selectors: {
        parishList,
        parishLink,
        parishName,
        parishAddress,
        parishCity,
        pagination,
      },
      urlPattern: urlPatternStr,
      waitForSelector,
      maxPages: 50,
      scheduleInDirectory: false,
    };

    // Test the config against the page that is already loaded.
    console.log('\nTesting selectors...');
    const testResults = await page.$$eval(
      parishList,
      (elements: Element[], linkSel: string) => {
        return elements.slice(0, 5).map(el => {
          const link = el.querySelector(linkSel);
          return {
            // Prefer the link text, then the first 80 characters of the
            // container text.
            name: link?.textContent?.trim() || el.textContent?.trim()?.substring(0, 80) || '(empty)',
            url: link?.getAttribute('href') || '(no link)',
          };
        });
      },
      parishLink
    );

    console.log(`\nTest extraction (first 5):`);
    if (testResults.length === 0) {
      // Make a selector that matches nothing obvious before the operator
      // is asked to confirm.
      console.log(' (no elements matched the parish list selector)');
    }
    for (const r of testResults) {
      console.log(` ${r.name}`);
      console.log(` -> ${r.url}`);
    }

    const confirmation = (await ask('\nSave this configuration? (yes/no): ')).toLowerCase();
    if (confirmation !== 'yes' && confirmation !== 'y') {
      log('Cancelled.');
      return;
    }

    // Save to database
    const diocese = await prisma.diocese.create({
      data: {
        name,
        country,
        language,
        website: new URL(url).origin,
        directoryUrl: url,
        scrapeConfig: scrapeConfig as any,
        active: true,
      },
    });
    log(`\nDiocese saved! ID: ${diocese.id}`);
    log(`Run: npx tsx scripts/scrape-diocese-directory.ts --diocese ${diocese.id} --dry-run`);
  } finally {
    await scraper.close();
  }
}
/**
 * CLI entry point: dispatches to list, test, or interactive setup mode based
 * on the command-line flags.
 *
 *  - `--list`                  prints all configured dioceses.
 *  - `--test <diocese-id>`     runs a scrape preview for one diocese.
 *  - `--url <u> --country <c>` starts interactive setup; `--language` is
 *    optional and defaults to the lower-cased country code.
 *
 * A flag whose value is missing (last argument, or directly followed by
 * another `--flag`) is treated as not supplied, so the script prints an error
 * or the usage text instead of passing `undefined` on. The Prisma client and
 * the connection pool are released in a `finally` block, so cleanup also runs
 * when a command throws.
 */
async function main() {
  const args = process.argv.slice(2);

  // Returns the token following `flag`, or undefined when the flag is absent
  // or has no value after it.
  const flagValue = (flag: string): string | undefined => {
    const idx = args.indexOf(flag);
    if (idx === -1) return undefined;
    const value = args[idx + 1];
    return value !== undefined && !value.startsWith('--') ? value : undefined;
  };

  try {
    if (args.includes('--list')) {
      await listDioceses();
      return;
    }

    if (args.includes('--test')) {
      const dioceseId = flagValue('--test');
      if (!dioceseId) {
        logError('--test requires a diocese ID');
        process.exitCode = 1;
        return;
      }
      await testDiocese(dioceseId);
      return;
    }

    const url = flagValue('--url');
    const country = flagValue('--country');
    if (!url || !country) {
      console.log('Usage:');
      console.log(' npx tsx scripts/setup-diocese.ts --url <directory-url> --country <CC> --language <lang>');
      console.log(' npx tsx scripts/setup-diocese.ts --list');
      console.log(' npx tsx scripts/setup-diocese.ts --test <diocese-id>');
      console.log('');
      console.log('Examples:');
      console.log(' npx tsx scripts/setup-diocese.ts --url https://bistum-mainz.de/pfarreien --country DE --language de');
      console.log(' npx tsx scripts/setup-diocese.ts --url https://diocese-paris.fr/paroisses --country FR --language fr');
      return;
    }

    // Without an explicit --language, fall back to the lower-cased country code.
    const language = flagValue('--language') ?? country.toLowerCase();
    await setupDiocese(url, country, language);
  } finally {
    // Release database resources on every path, including errors.
    await prisma.$disconnect();
    await pool.end();
  }
}
// Run the CLI. Any rejection is reported through logError and the process
// exits with status 1. The rejection value is narrowed first because it is
// not guaranteed to be an Error instance.
main().catch((error: unknown) => {
  const reason = error instanceof Error ? error.message : String(error);
  logError(`Fatal error: ${reason}`);
  process.exit(1);
});