#!/usr/bin/env tsx
/**
 * Interactive helper to configure a new diocese for scraping.
 *
 * Usage:
 *   npx tsx scripts/setup-diocese.ts --url https://bistum-mainz.de/pfarreien --country DE --language de
 *   npx tsx scripts/setup-diocese.ts --url https://diocese-paris.fr/paroisses --country FR --language fr
 *   npx tsx scripts/setup-diocese.ts --list                    # List all configured dioceses
 *   npx tsx scripts/setup-diocese.ts --test <diocese-id>       # Test scraping a diocese
 */
|
||
|
|
|
||
|
|
import dotenv from 'dotenv';
|
||
|
|
import path from 'path';
|
||
|
|
// Load environment variables from the `.env` file in the current working
// directory, so values such as DATABASE_URL (read further down when the
// connection pool is created) can be supplied through that file.
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||
|
|
|
||
|
|
import { Pool } from 'pg';
|
||
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
||
|
|
import { PrismaClient } from '@prisma/client';
|
||
|
|
import { DioceseDirectoryScraper, DioceseScrapeConfig } from '../src/scrapers/diocese-directory-scraper';
|
||
|
|
import readline from 'readline';
|
||
|
|
|
||
|
|
// PostgreSQL connection pool; the connection string comes from the
// DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });

// Prisma driver adapter wrapping the pg pool above.
const adapter = new PrismaPg(pool);

// Prisma client used for every diocese query in this script.
const prisma = new PrismaClient({ adapter });
|
||
|
|
|
||
|
|
/**
 * Prints an informational message to stdout, prefixed with the current
 * timestamp in ISO-8601 (UTC) form: `[<timestamp>] <msg>`.
 *
 * @param msg - Text to print after the timestamp prefix.
 */
function log(msg: string): void {
  console.log(`[${new Date().toISOString()}] ${msg}`);
}
|
||
|
|
|
||
|
|
/**
 * Prints an error message to stderr, prefixed with the current timestamp in
 * ISO-8601 (UTC) form and an `ERROR:` tag: `[<timestamp>] ERROR: <msg>`.
 *
 * @param msg - Text to print after the `ERROR:` tag.
 */
function logError(msg: string): void {
  console.error(`[${new Date().toISOString()}] ERROR: ${msg}`);
}
|
||
|
|
|
||
|
|
/**
 * Prompts the user on the terminal and returns the trimmed answer.
 *
 * A fresh readline interface over stdin/stdout is created for each question
 * and closed once the answer has been read.
 *
 * @param question - Prompt text written to stdout before reading input.
 * @returns The answer with surrounding whitespace removed, or an empty
 *   string when the interface closes before an answer is given.
 */
function ask(question: string): Promise<string> {
  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });

  return new Promise<string>(resolve => {
    // If the interface closes without an answer (e.g. stdin reaches EOF) the
    // question callback never fires; settle with '' so callers do not wait
    // forever. After a real answer this is a no-op because the promise is
    // already resolved.
    rl.once('close', () => resolve(''));

    rl.question(question, answer => {
      // Resolve before closing: close() emits 'close', and the handler above
      // must not win over the real answer.
      resolve(answer.trim());
      rl.close();
    });
  });
}
|
||
|
|
|
||
|
|
/**
 * Prints a table of all dioceses stored in the database, ordered by country
 * and then by name.
 *
 * Columns: ID, name (cut to 28 characters), country, active flag, church
 * count, and the date part of the last scrape timestamp ("Never" when unset).
 * When the table is empty a short notice is logged instead.
 */
async function listDioceses(): Promise<void> {
  const dioceses = await prisma.diocese.findMany({
    orderBy: [{ country: 'asc' }, { name: 'asc' }],
  });

  if (dioceses.length === 0) {
    log('No dioceses configured yet.');
    return;
  }

  // Horizontal separator reused above the header, below it, and at the end.
  const rule = '─'.repeat(100);

  console.log('\nConfigured Dioceses:');
  console.log(rule);
  console.log(
    'ID'.padEnd(38) +
      'Name'.padEnd(30) +
      'Country'.padEnd(10) +
      'Active'.padEnd(8) +
      'Churches'.padEnd(10) +
      'Last Scraped'
  );
  console.log(rule);

  for (const d of dioceses) {
    console.log(
      d.id.padEnd(38) +
        // Truncate long names so the fixed-width columns stay aligned.
        d.name.substring(0, 28).padEnd(30) +
        d.country.padEnd(10) +
        (d.active ? 'Yes' : 'No').padEnd(8) +
        String(d.churchCount).padEnd(10) +
        // Keep only the YYYY-MM-DD part of the ISO timestamp.
        (d.lastScrapedAt ? d.lastScrapedAt.toISOString().split('T')[0] : 'Never')
    );
  }

  console.log(rule);
  console.log(`Total: ${dioceses.length} dioceses`);
}
|
||
|
|
|
||
|
|
/**
 * Runs the directory scraper against an already configured diocese and
 * prints the first ten discovered parishes (name, URL, and address/city when
 * present), followed by a count of the remaining ones.
 *
 * Logs an error and returns without scraping when the ID is empty, the
 * diocese does not exist, it has no directory URL, or its stored scrape
 * config has no selectors. The scraper is always closed afterwards.
 *
 * @param dioceseId - Primary key of the diocese to test.
 */
async function testDiocese(dioceseId: string): Promise<void> {
  // Guard against a missing CLI value; looking up an undefined/empty ID
  // would otherwise fail inside the database client.
  if (!dioceseId) {
    logError('Diocese ID is required');
    return;
  }

  const diocese = await prisma.diocese.findUnique({ where: { id: dioceseId } });
  if (!diocese) {
    logError(`Diocese not found: ${dioceseId}`);
    return;
  }

  if (!diocese.directoryUrl) {
    logError(`Diocese ${diocese.name} has no directory URL`);
    return;
  }

  // scrapeConfig is stored as JSON; it is only usable when selectors exist.
  const config = diocese.scrapeConfig as DioceseScrapeConfig | null;
  if (!config?.selectors) {
    logError(`Diocese ${diocese.name} has no scrape config`);
    return;
  }

  log(`Testing diocese: ${diocese.name}`);
  log(`Directory URL: ${diocese.directoryUrl}`);
  log('');

  const previewLimit = 10;
  const scraper = new DioceseDirectoryScraper();
  try {
    const parishes = await scraper.scrapeDirectory(diocese.directoryUrl, config);

    log(`\nDiscovered ${parishes.length} parishes:\n`);
    for (const p of parishes.slice(0, previewLimit)) {
      console.log(`  ${p.name}`);
      console.log(`    URL: ${p.url}`);
      if (p.address) console.log(`    Address: ${p.address}`);
      if (p.city) console.log(`    City: ${p.city}`);
      console.log('');
    }

    if (parishes.length > previewLimit) {
      console.log(`  ... and ${parishes.length - previewLimit} more`);
    }
  } finally {
    // Release scraper resources whether or not scraping succeeded.
    await scraper.close();
  }
}
|
||
|
|
|
||
|
|
/**
 * Interactively creates a new diocese record.
 *
 * Steps:
 *  1. Validate the directory URL and ask for the diocese name.
 *  2. Abort if a diocese with the same name and country already exists.
 *  3. Open the directory page and print a structural summary (title, link
 *     count, most common link path patterns, counts of list-like elements).
 *  4. Ask for the CSS selectors and optional settings.
 *  5. Run the list/link selectors against the page and show up to five
 *     sample results.
 *  6. After confirmation, store the diocese together with its scrape config.
 *
 * Every abort path logs a message and returns; the scraper is always closed
 * once it has been initialised.
 *
 * @param url - Directory page listing the diocese's parishes.
 * @param country - Country code stored on the diocese record.
 * @param language - Language code stored on the diocese record.
 */
async function setupDiocese(url: string, country: string, language: string): Promise<void> {
  log(`Setting up diocese from: ${url}`);
  log(`Country: ${country}, Language: ${language}`);

  // Parse the URL before asking anything, so a malformed value is reported
  // immediately instead of after the whole interview.
  let website: string;
  try {
    website = new URL(url).origin;
  } catch {
    logError(`Invalid directory URL: ${url}`);
    return;
  }

  // Ask for diocese name
  const name = await ask('\nDiocese name (e.g. "Bistum Mainz"): ');
  if (!name) {
    logError('Name is required');
    return;
  }

  // Check if already exists
  const existing = await prisma.diocese.findFirst({
    where: { name, country },
  });
  if (existing) {
    logError(`Diocese "${name}" already exists in ${country} (ID: ${existing.id})`);
    return;
  }

  // Probe the page structure
  log('\nProbing page structure...');
  const scraper = new DioceseDirectoryScraper();
  await scraper.init();

  try {
    // NOTE(review): the page object is reached through an `any` cast, so it
    // is untyped here; presumably a headless-browser page created by init()
    // — confirm against DioceseDirectoryScraper.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const page = (scraper as any).page;
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

    // Analyze page - count links and common patterns.
    // The callback is handed to page.evaluate, so it must be self-contained
    // and must not reference anything from this module's scope.
    const analysis = await page.evaluate(() => {
      const links = Array.from(document.querySelectorAll('a'));
      const linkPatterns: Record<string, number> = {};

      for (const link of links) {
        const href = link.href;
        if (!href) continue;

        // Group links by their parent path: /a/b/c -> /a/b/*
        try {
          const pathname = new URL(href).pathname;
          const segments = pathname.split('/').filter(Boolean);
          if (segments.length >= 1) {
            const parent = segments.slice(0, -1).join('/');
            // A single-segment path has no parent; report it as "/*"
            // rather than the malformed "//*".
            const pattern = parent ? `/${parent}/*` : '/*';
            linkPatterns[pattern] = (linkPatterns[pattern] || 0) + 1;
          }
        } catch {
          // href could not be parsed as a URL; leave it out of the statistics.
        }
      }

      // Find most common list-like elements
      const listSelectors = [
        'ul li', 'ol li', 'div.parish', 'div.item', 'article',
        'tr', '.card', '.entry', '.listing', '.result',
      ];

      const selectorCounts: Record<string, number> = {};
      for (const sel of listSelectors) {
        selectorCounts[sel] = document.querySelectorAll(sel).length;
      }

      return {
        title: document.title,
        totalLinks: links.length,
        // Ten most frequent patterns, most common first.
        linkPatterns: Object.entries(linkPatterns)
          .sort(([, a], [, b]) => b - a)
          .slice(0, 10),
        selectorCounts,
        bodyTextLength: document.body?.textContent?.length || 0,
      };
    });

    console.log(`\nPage: ${analysis.title}`);
    console.log(`Total links: ${analysis.totalLinks}`);
    console.log(`\nMost common link patterns:`);
    for (const [pattern, count] of analysis.linkPatterns) {
      console.log(`  ${pattern}: ${count} links`);
    }
    console.log(`\nElement counts:`);
    for (const [sel, count] of Object.entries(analysis.selectorCounts)) {
      if ((count as number) > 0) console.log(`  ${sel}: ${count}`);
    }

    // Ask for selectors
    console.log('\nNow configure CSS selectors for this diocese.\n');

    const parishList = await ask('Parish list container selector (e.g. "ul.parishes li", ".parish-item"): ');
    const parishLink = await ask('Parish link selector within container (e.g. "a", "a.parish-link"): ');

    // Both selectors are used by the selector test below; an empty selector
    // cannot be evaluated, so stop before asking the optional questions.
    if (!parishList || !parishLink) {
      logError('Parish list and parish link selectors are required');
      return;
    }

    // Optional answers: an empty reply means "not configured".
    const parishName = (await ask('Parish name selector (leave empty to use link text): ')) || undefined;
    const parishAddress = (await ask('Address selector (leave empty if none): ')) || undefined;
    const parishCity = (await ask('City selector (leave empty if none): ')) || undefined;
    const pagination = (await ask('Pagination "next" selector (leave empty if none): ')) || undefined;
    const urlPatternStr = (await ask('URL pattern regex (leave empty for all): ')) || undefined;
    const waitForSelector = (await ask('Wait for selector (leave empty if not needed): ')) || undefined;

    // Reject a pattern that does not compile instead of persisting it.
    if (urlPatternStr !== undefined) {
      try {
        new RegExp(urlPatternStr);
      } catch {
        logError(`Invalid URL pattern regex: ${urlPatternStr}`);
        return;
      }
    }

    const scrapeConfig: DioceseScrapeConfig = {
      selectors: {
        parishList,
        parishLink,
        parishName,
        parishAddress,
        parishCity,
        pagination,
      },
      urlPattern: urlPatternStr,
      waitForSelector,
      // Fixed defaults for configurations created by this helper.
      maxPages: 50,
      scheduleInDirectory: false,
    };

    // Test the config
    console.log('\nTesting selectors...');
    let testResults: Array<{ name: string; url: string }>;
    try {
      testResults = await page.$$eval(
        parishList,
        (elements: Element[], linkSel: string) => {
          return elements.slice(0, 5).map(el => {
            const link = el.querySelector(linkSel);
            return {
              // Prefer the link text; fall back to a snippet of the container text.
              name: link?.textContent?.trim() || el.textContent?.trim()?.substring(0, 80) || '(empty)',
              url: link?.getAttribute('href') || '(no link)',
            };
          });
        },
        parishLink
      );
    } catch (error: unknown) {
      // Typically a selector the browser cannot parse.
      const reason = error instanceof Error ? error.message : String(error);
      logError(`Selector test failed: ${reason}`);
      return;
    }

    console.log(`\nTest extraction (first 5):`);
    if (testResults.length === 0) {
      console.log('  (no elements matched the parish list selector)');
    }
    for (const r of testResults) {
      console.log(`  ${r.name}`);
      console.log(`    -> ${r.url}`);
    }

    const confirm = await ask('\nSave this configuration? (yes/no): ');
    if (confirm.toLowerCase() !== 'yes' && confirm.toLowerCase() !== 'y') {
      log('Cancelled.');
      return;
    }

    // Save to database
    const diocese = await prisma.diocese.create({
      data: {
        name,
        country,
        language,
        website,
        directoryUrl: url,
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        scrapeConfig: scrapeConfig as any,
        active: true,
      },
    });

    log(`\nDiocese saved! ID: ${diocese.id}`);
    log(`Run: npx tsx scripts/scrape-diocese-directory.ts --diocese ${diocese.id} --dry-run`);
  } finally {
    // Release scraper resources on every exit path after init().
    await scraper.close();
  }
}
|
||
|
|
|
||
|
|
/**
 * CLI entry point. Dispatches on the command-line flags:
 *
 *  - `--list`                      list all configured dioceses
 *  - `--test <diocese-id>`         test-scrape an existing diocese
 *  - `--url <u> --country <CC>`    interactively set up a new diocese
 *    (`--language` is optional and defaults to the lower-cased country code)
 *
 * Prints usage help when the required setup flags or their values are
 * missing. The database client and connection pool are released on every
 * path, including when a command throws.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);

  // Returns the token following `flag`, or undefined when the flag is
  // absent, is the last token, or is directly followed by another flag.
  const flagValue = (flag: string): string | undefined => {
    const idx = args.indexOf(flag);
    if (idx === -1) return undefined;
    const value = args[idx + 1];
    return value !== undefined && !value.startsWith('--') ? value : undefined;
  };

  try {
    if (args.includes('--list')) {
      await listDioceses();
      return;
    }

    if (args.includes('--test')) {
      const dioceseId = flagValue('--test');
      if (!dioceseId) {
        logError('--test requires a diocese ID');
        process.exitCode = 1;
        return;
      }
      await testDiocese(dioceseId);
      return;
    }

    const url = flagValue('--url');
    const country = flagValue('--country');

    if (!url || !country) {
      console.log('Usage:');
      console.log('  npx tsx scripts/setup-diocese.ts --url <directory-url> --country <CC> --language <lang>');
      console.log('  npx tsx scripts/setup-diocese.ts --list');
      console.log('  npx tsx scripts/setup-diocese.ts --test <diocese-id>');
      console.log('');
      console.log('Examples:');
      console.log('  npx tsx scripts/setup-diocese.ts --url https://bistum-mainz.de/pfarreien --country DE --language de');
      console.log('  npx tsx scripts/setup-diocese.ts --url https://diocese-paris.fr/paroisses --country FR --language fr');
      return;
    }

    const language = flagValue('--language') ?? country.toLowerCase();

    await setupDiocese(url, country, language);
  } finally {
    // Single cleanup point for every branch above.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||
|
|
|
||
|
|
// Run the CLI; any unhandled failure is logged and turned into a non-zero
// exit status.
main().catch((error: unknown) => {
  // Rejections are not guaranteed to be Error instances.
  const message = error instanceof Error ? error.message : String(error);
  logError(`Fatal error: ${message}`);
  process.exit(1);
});
|