chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
328
scripts/setup-diocese.ts
Executable file
328
scripts/setup-diocese.ts
Executable file
@@ -0,0 +1,328 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Interactive helper to configure a new diocese for scraping
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/setup-diocese.ts --url https://bistum-mainz.de/pfarreien --country DE --language de
|
||||
* npx tsx scripts/setup-diocese.ts --url https://diocese-paris.fr/paroisses --country FR --language fr
|
||||
* npx tsx scripts/setup-diocese.ts --list # List all configured dioceses
|
||||
* npx tsx scripts/setup-diocese.ts --test <diocese-id> # Test scraping a diocese
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { DioceseDirectoryScraper, DioceseScrapeConfig } from '../src/scrapers/diocese-directory-scraper';
|
||||
import readline from 'readline';
|
||||
|
||||
// Shared database handles for the whole script: a pg connection pool built
// from DATABASE_URL, wrapped in the Prisma pg adapter, and handed to the
// Prisma client.
const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter: adapter });
|
||||
|
||||
/**
 * Prints an informational message to stdout, prefixed with the current
 * ISO-8601 timestamp in square brackets.
 *
 * @param msg - Text to print after the timestamp prefix.
 */
function log(msg: string) {
  const timestamp = new Date().toISOString();
  console.log('[' + timestamp + '] ' + msg);
}
|
||||
|
||||
/**
 * Prints an error message to stderr, prefixed with the current ISO-8601
 * timestamp in square brackets and the literal tag `ERROR:`.
 *
 * @param msg - Text to print after the `ERROR:` tag.
 */
function logError(msg: string) {
  const timestamp = new Date().toISOString();
  console.error('[' + timestamp + '] ERROR: ' + msg);
}
|
||||
|
||||
/**
 * Writes `question` to stdout and resolves with the user's answer read from
 * stdin, with surrounding whitespace removed.
 *
 * A fresh readline interface is created for each question and closed once the
 * answer arrives. If the interface closes before an answer is given (stdin
 * reached EOF, e.g. Ctrl+D or an exhausted pipe), the promise resolves with an
 * empty string instead of never settling, so callers take their "empty
 * answer" path rather than the script ending silently mid-prompt.
 *
 * @param question - Prompt text shown before reading the answer.
 * @returns The trimmed answer, or `''` when input ended without one.
 */
function ask(question: string): Promise<string> {
  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
  return new Promise(resolve => {
    // 'close' fires on EOF and also after our own rl.close() below; in the
    // latter case the promise is already settled and this call is a no-op.
    rl.once('close', () => resolve(''));

    rl.question(question, answer => {
      // Settle BEFORE closing: closing may emit 'close' immediately, and the
      // first resolve() wins.
      resolve(answer.trim());
      rl.close();
    });
  });
}
|
||||
|
||||
/**
 * Prints all diocese records as a fixed-width table on stdout.
 *
 * Rows are fetched ordered by country, then name. Columns are ID, name
 * (cut to 28 characters), country, active flag, church count and the date
 * part of the last scrape timestamp (or "Never"). When there are no records,
 * a single log line is emitted instead of the table.
 */
async function listDioceses() {
  const rows = await prisma.diocese.findMany({
    orderBy: [{ country: 'asc' }, { name: 'asc' }],
  });

  if (!rows.length) {
    log('No dioceses configured yet.');
    return;
  }

  const rule = '─'.repeat(100);
  const header = [
    'ID'.padEnd(38),
    'Name'.padEnd(30),
    'Country'.padEnd(10),
    'Active'.padEnd(8),
    'Churches'.padEnd(10),
    'Last Scraped',
  ].join('');

  console.log('\nConfigured Dioceses:');
  console.log(rule);
  console.log(header);
  console.log(rule);

  rows.forEach(row => {
    // Only the YYYY-MM-DD part of the ISO timestamp is shown.
    const lastScraped = row.lastScrapedAt
      ? row.lastScrapedAt.toISOString().split('T')[0]
      : 'Never';
    const activeLabel = row.active ? 'Yes' : 'No';

    const cells = [
      row.id.padEnd(38),
      row.name.substring(0, 28).padEnd(30),
      row.country.padEnd(10),
      activeLabel.padEnd(8),
      String(row.churchCount).padEnd(10),
      lastScraped,
    ];
    console.log(cells.join(''));
  });

  console.log(rule);
  console.log(`Total: ${rows.length} dioceses`);
}
|
||||
|
||||
/**
 * Runs the directory scraper against a stored diocese and prints a preview
 * of what it finds, without writing anything back to the database.
 *
 * Logs an error and returns early when the diocese does not exist, has no
 * directory URL, or has no scrape config with selectors. Otherwise prints the
 * number of discovered parishes and details of the first ten. The scraper is
 * always closed afterwards, including when scraping throws.
 *
 * @param dioceseId - Primary key of the diocese record to test.
 */
async function testDiocese(dioceseId: string) {
  const record = await prisma.diocese.findUnique({ where: { id: dioceseId } });
  if (!record) {
    logError(`Diocese not found: ${dioceseId}`);
    return;
  }

  const { directoryUrl } = record;
  if (!directoryUrl) {
    logError(`Diocese ${record.name} has no directory URL`);
    return;
  }

  const scrapeConfig = record.scrapeConfig as DioceseScrapeConfig | null;
  if (!scrapeConfig?.selectors) {
    logError(`Diocese ${record.name} has no scrape config`);
    return;
  }

  log(`Testing diocese: ${record.name}`);
  log(`Directory URL: ${directoryUrl}`);
  log('');

  const PREVIEW_LIMIT = 10;
  const scraper = new DioceseDirectoryScraper();
  try {
    const found = await scraper.scrapeDirectory(directoryUrl, scrapeConfig);

    log(`\nDiscovered ${found.length} parishes:\n`);
    found.slice(0, PREVIEW_LIMIT).forEach(parish => {
      console.log(` ${parish.name}`);
      console.log(` URL: ${parish.url}`);
      if (parish.address) {
        console.log(` Address: ${parish.address}`);
      }
      if (parish.city) {
        console.log(` City: ${parish.city}`);
      }
      console.log('');
    });

    const remaining = found.length - PREVIEW_LIMIT;
    if (remaining > 0) {
      console.log(` ... and ${remaining} more`);
    }
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
/**
 * Interactively configures and saves a new diocese for directory scraping.
 *
 * Steps, in order:
 *  1. Validate `url` and derive the site origin from it.
 *  2. Ask for the diocese name; abort if empty or if a diocese with the same
 *     name and country already exists.
 *  3. Load the page in the scraper's browser and print a structural summary
 *     (most common link path patterns, counts of common list-like elements).
 *  4. Ask for the CSS selectors and options that form the scrape config.
 *     The list-container and link selectors are mandatory; the URL pattern,
 *     when given, must be a valid regular expression.
 *  5. Run the selectors against the loaded page and show the first five hits.
 *  6. On confirmation ("yes"/"y"), create the diocese record.
 *
 * Validation failures and cancellation are reported via the log helpers and
 * return normally. The scraper is always closed once it has been initialised.
 *
 * @param url - Directory page listing the diocese's parishes.
 * @param country - Country code stored on the record.
 * @param language - Language code stored on the record.
 */
async function setupDiocese(url: string, country: string, language: string) {
  // Parse the URL up front: its origin is stored as the diocese website, and a
  // malformed URL should be rejected before any prompt or browser launch
  // rather than after the user has typed in every selector.
  let website: string;
  try {
    website = new URL(url).origin;
  } catch {
    logError(`Invalid directory URL: ${url}`);
    return;
  }

  log(`Setting up diocese from: ${url}`);
  log(`Country: ${country}, Language: ${language}`);

  // Ask for diocese name
  const name = await ask('\nDiocese name (e.g. "Bistum Mainz"): ');
  if (!name) {
    logError('Name is required');
    return;
  }

  // Check if already exists
  const existing = await prisma.diocese.findFirst({
    where: { name, country },
  });
  if (existing) {
    logError(`Diocese "${name}" already exists in ${country} (ID: ${existing.id})`);
    return;
  }

  // Probe the page structure
  log('\nProbing page structure...');
  const scraper = new DioceseDirectoryScraper();
  await scraper.init();

  try {
    // The scraper's page is reached through an `any` cast, presumably because
    // it is not part of the scraper's public API (verify against the class).
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const page = (scraper as any).page;
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

    // Analyze page - count links and common patterns.
    // NOTE: this callback runs inside the browser page, not in Node.
    const analysis = await page.evaluate(() => {
      const links = Array.from(document.querySelectorAll('a'));
      const linkPatterns: Record<string, number> = {};

      for (const link of links) {
        const href = link.href;
        if (!href) continue;
        // Group links by parent path: /a/b/c -> /a/b/*
        try {
          const pathname = new URL(href).pathname;
          const segments = pathname.split('/').filter(Boolean);
          if (segments.length >= 1) {
            // Joining with a leading '' avoids a doubled slash ("//*") for
            // single-segment paths, which have no parent segments.
            const pattern = ['', ...segments.slice(0, -1), '*'].join('/');
            linkPatterns[pattern] = (linkPatterns[pattern] || 0) + 1;
          }
        } catch { /* href is not a parseable URL; skip it */ }
      }

      // Find most common list-like elements
      const listSelectors = [
        'ul li', 'ol li', 'div.parish', 'div.item', 'article',
        'tr', '.card', '.entry', '.listing', '.result',
      ];

      const selectorCounts: Record<string, number> = {};
      for (const sel of listSelectors) {
        selectorCounts[sel] = document.querySelectorAll(sel).length;
      }

      return {
        title: document.title,
        totalLinks: links.length,
        // Ten most frequent patterns, most frequent first.
        linkPatterns: Object.entries(linkPatterns)
          .sort(([, a], [, b]) => b - a)
          .slice(0, 10),
        selectorCounts,
        bodyTextLength: document.body?.textContent?.length || 0,
      };
    });

    console.log(`\nPage: ${analysis.title}`);
    console.log(`Total links: ${analysis.totalLinks}`);
    console.log(`\nMost common link patterns:`);
    for (const [pattern, count] of analysis.linkPatterns) {
      console.log(` ${pattern}: ${count} links`);
    }
    console.log(`\nElement counts:`);
    for (const [sel, count] of Object.entries(analysis.selectorCounts)) {
      if (count > 0) console.log(` ${sel}: ${count}`);
    }

    // Ask for selectors
    console.log('\nNow configure CSS selectors for this diocese.\n');

    // The two required selectors are checked immediately: an empty selector
    // would fail in the test extraction below and is useless as saved config.
    const parishList = await ask('Parish list container selector (e.g. "ul.parishes li", ".parish-item"): ');
    if (!parishList) {
      logError('Parish list container selector is required');
      return;
    }
    const parishLink = await ask('Parish link selector within container (e.g. "a", "a.parish-link"): ');
    if (!parishLink) {
      logError('Parish link selector is required');
      return;
    }

    // Optional answers: ask() yields '' when left blank, mapped to undefined.
    const parishName = await ask('Parish name selector (leave empty to use link text): ') || undefined;
    const parishAddress = await ask('Address selector (leave empty if none): ') || undefined;
    const parishCity = await ask('City selector (leave empty if none): ') || undefined;
    const pagination = await ask('Pagination "next" selector (leave empty if none): ') || undefined;

    const urlPatternStr = await ask('URL pattern regex (leave empty for all): ') || undefined;
    if (urlPatternStr !== undefined) {
      // Reject a pattern that does not compile instead of persisting it.
      try {
        new RegExp(urlPatternStr);
      } catch {
        logError(`Invalid URL pattern regex: ${urlPatternStr}`);
        return;
      }
    }

    const waitForSelector = await ask('Wait for selector (leave empty if not needed): ') || undefined;

    const scrapeConfig: DioceseScrapeConfig = {
      selectors: {
        parishList,
        parishLink,
        parishName,
        parishAddress,
        parishCity,
        pagination,
      },
      urlPattern: urlPatternStr,
      waitForSelector,
      maxPages: 50,
      scheduleInDirectory: false,
    };

    // Test the config against the page that is already loaded
    console.log('\nTesting selectors...');
    const testResults = await page.$$eval(
      parishList,
      (elements: Element[], linkSel: string) => {
        return elements.slice(0, 5).map(el => {
          const link = el.querySelector(linkSel);
          return {
            // Prefer the link text; fall back to the container's text.
            name: link?.textContent?.trim() || el.textContent?.trim()?.substring(0, 80) || '(empty)',
            url: link?.getAttribute('href') || '(no link)',
          };
        });
      },
      parishLink
    );

    console.log(`\nTest extraction (first 5):`);
    for (const r of testResults) {
      console.log(` ${r.name}`);
      console.log(` -> ${r.url}`);
    }

    const confirm = await ask('\nSave this configuration? (yes/no): ');
    if (confirm.toLowerCase() !== 'yes' && confirm.toLowerCase() !== 'y') {
      log('Cancelled.');
      return;
    }

    // Save to database
    const diocese = await prisma.diocese.create({
      data: {
        name,
        country,
        language,
        website,
        directoryUrl: url,
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        scrapeConfig: scrapeConfig as any,
        active: true,
      },
    });

    log(`\nDiocese saved! ID: ${diocese.id}`);
    log(`Run: npx tsx scripts/scrape-diocese-directory.ts --diocese ${diocese.id} --dry-run`);
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
/**
 * Returns the command-line token that follows `flag`.
 *
 * @param args - Command-line arguments (without node/script entries).
 * @param flag - Option name to look up, e.g. `--url`.
 * @returns The option's value, or `undefined` when the flag is absent, is the
 *   last token, or is directly followed by another `--` option.
 */
function readFlagValue(args: string[], flag: string): string | undefined {
  const idx = args.indexOf(flag);
  if (idx === -1) return undefined;
  const value = args[idx + 1];
  // A following token that is itself an option means the value was omitted.
  if (value === undefined || value.startsWith('--')) return undefined;
  return value;
}

/** Prints the command-line usage summary and examples to stdout. */
function printUsage(): void {
  console.log('Usage:');
  console.log(' npx tsx scripts/setup-diocese.ts --url <directory-url> --country <CC> --language <lang>');
  console.log(' npx tsx scripts/setup-diocese.ts --list');
  console.log(' npx tsx scripts/setup-diocese.ts --test <diocese-id>');
  console.log('');
  console.log('Examples:');
  console.log(' npx tsx scripts/setup-diocese.ts --url https://bistum-mainz.de/pfarreien --country DE --language de');
  console.log(' npx tsx scripts/setup-diocese.ts --url https://diocese-paris.fr/paroisses --country FR --language fr');
}

/**
 * Command-line entry point. Dispatches on the arguments:
 *  - `--list`: print all configured dioceses.
 *  - `--test <diocese-id>`: dry-run the scraper for one diocese.
 *  - `--url <url> --country <CC> [--language <lang>]`: interactive setup.
 *    The language defaults to the lower-cased country code.
 *  - anything else: print usage.
 *
 * Options given without a value are rejected here instead of being passed on
 * as `undefined`. The Prisma client and the pg pool are released in a single
 * `finally`, so they are also closed when a command throws.
 */
async function main() {
  const args = process.argv.slice(2);

  try {
    if (args.includes('--list')) {
      await listDioceses();
      return;
    }

    if (args.includes('--test')) {
      const dioceseId = readFlagValue(args, '--test');
      if (!dioceseId) {
        logError('--test requires a diocese ID');
        return;
      }
      await testDiocese(dioceseId);
      return;
    }

    const url = readFlagValue(args, '--url');
    const country = readFlagValue(args, '--country');
    if (!url || !country) {
      printUsage();
      return;
    }

    const language = readFlagValue(args, '--language') ?? country.toLowerCase();
    await setupDiocese(url, country, language);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
// Run the CLI. Any rejection is reported on stderr and the process exits
// with status 1. Rejections are not guaranteed to be Error instances, so the
// message is derived defensively instead of printing "undefined".
main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  logError(`Fatal error: ${message}`);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user