chore: sync with Gitea master and restore local-only files

Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Albert
2026-04-12 19:11:22 -04:00
parent 76cca3ba75
commit 2c51513851
133 changed files with 30381 additions and 0 deletions

View File

@@ -0,0 +1,165 @@
import { config } from 'dotenv';
import { PrismaClient } from '@prisma/client';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
// Load environment files: .env.local is read first, then .env.
config({ path: '.env.local' });
config({ path: '.env' });
// Fail fast at module load when no connection string is configured.
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
throw new Error('DATABASE_URL environment variable is not set');
}
// A single pg pool backs the Prisma client through the PrismaPg adapter.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/** Per-country tally of OSM churches and their website coverage. */
type CountryStats = {
  /** Country value of the grouped churches. */
  country: string;

  /** Number of churches counted for this country. */
  totalChurches: number;
  /** Churches counted as already having a website. */
  withWebsite: number;
  /** Churches counted as having no website. */
  withoutWebsite: number;
  /** Churches still to be enriched. */
  needEnrichment: number;

  /** Share of churches with a website, as a percentage (0-100). */
  websitePercent: number;
  /** Ranking score; higher means the country should be enriched sooner. */
  priority: number;
};
/**
 * Ranks countries by how much website-enrichment work remains among OSM-sourced churches.
 *
 * Prints the full ranking, a detailed top 10, a timeline estimate and a ready-to-paste
 * `COUNTRY_PRIORITY` array. A church counts as covered when `hasWebsite` is truthy or
 * `website` is non-empty; every other church needs enrichment.
 *
 * On failure the error is logged and the process exit code is set to 1. The Prisma client
 * and the pg pool are closed in every case.
 */
async function analyzeEnrichmentPriority() {
  // Daily throughput assumed by the timeline estimate.
  const CHURCHES_PER_DAY = 390;
  const rule = '═══════════════════════════════════════════════════════════════════════════';

  try {
    console.log('Analyzing enrichment priority by country...\n');

    // Only the fields needed for the tally are selected.
    const churches = await prisma.church.findMany({
      where: {
        source: 'osm',
      },
      select: {
        country: true,
        hasWebsite: true,
        website: true,
      },
    });

    // Group by country; churches without a country are collected under 'Unknown'.
    const byCountry = new Map<string, CountryStats>();
    for (const church of churches) {
      const country = church.country || 'Unknown';
      let entry = byCountry.get(country);
      if (!entry) {
        entry = {
          country,
          totalChurches: 0,
          withWebsite: 0,
          withoutWebsite: 0,
          websitePercent: 0,
          needEnrichment: 0,
          priority: 0,
        };
        byCountry.set(country, entry);
      }
      entry.totalChurches++;
      if (church.hasWebsite || church.website) {
        entry.withWebsite++;
      } else {
        entry.withoutWebsite++;
        entry.needEnrichment++;
      }
    }

    // Every entry holds at least one church, so the division below cannot be by zero.
    const stats = [...byCountry.values()];
    for (const stat of stats) {
      stat.websitePercent = (stat.withWebsite / stat.totalChurches) * 100;
      // The score is the pending count (in thousands) scaled by a factor between 0.8 and 1.0
      // that grows with the share of churches lacking a website, so large countries with
      // low coverage rank first.
      const needWeight = stat.needEnrichment / 1000;
      const coverageGap = 100 - stat.websitePercent;
      stat.priority = needWeight * 0.8 + (coverageGap / 100) * needWeight * 0.2;
    }

    // Highest priority first.
    stats.sort((a, b) => b.priority - a.priority);

    console.log(rule);
    console.log('ENRICHMENT PRIORITY RANKING');
    console.log(rule);
    console.log('');
    // The printed formula now matches the computation above.
    console.log('Priority formula: (churches_needing_enrichment / 1000) * (0.8 + 0.2 * coverage_gap), coverage_gap as a 0-1 fraction');
    console.log('This favors countries with many churches and low website coverage.');
    console.log('');
    console.log('Rank | Country | Total | Need Enrichment | Coverage | Priority Score');
    console.log('─────┼─────────┼───────┼────────────────┼──────────┼────────────────');
    stats.forEach((stat, index) => {
      const rank = String(index + 1).padStart(4);
      const country = stat.country.padEnd(7);
      const total = String(stat.totalChurches).padStart(5);
      const need = String(stat.needEnrichment).padStart(15);
      const coverage = `${stat.websitePercent.toFixed(1)}%`.padStart(8);
      const priority = stat.priority.toFixed(2).padStart(14);
      console.log(`${rank} | ${country} | ${total} | ${need} | ${coverage} | ${priority}`);
    });
    console.log('');
    console.log(rule);
    console.log('');

    console.log('TOP 10 COUNTRIES TO PRIORITIZE:');
    console.log('');
    stats.slice(0, 10).forEach((stat, index) => {
      console.log(`${index + 1}. ${stat.country}`);
      console.log(` Total churches: ${stat.totalChurches.toLocaleString()}`);
      console.log(` Need enrichment: ${stat.needEnrichment.toLocaleString()} (${(100 - stat.websitePercent).toFixed(1)}% missing)`);
      console.log(` Current coverage: ${stat.websitePercent.toFixed(1)}%`);
      console.log(` Priority score: ${stat.priority.toFixed(2)}`);
      console.log('');
    });

    // Timeline estimate at the assumed daily throughput.
    const totalNeedEnrichment = stats.reduce((sum, s) => sum + s.needEnrichment, 0);
    const daysAtFullSpeed = Math.ceil(totalNeedEnrichment / CHURCHES_PER_DAY);
    const monthsAtFullSpeed = (daysAtFullSpeed / 30).toFixed(1);
    console.log(rule);
    console.log('ENRICHMENT TIMELINE');
    console.log(rule);
    console.log(`Total churches needing enrichment: ${totalNeedEnrichment.toLocaleString()}`);
    console.log(`At ${CHURCHES_PER_DAY} churches/day (free tier): ${daysAtFullSpeed} days (~${monthsAtFullSpeed} months)`);
    console.log('');

    // Emit the ranking as a source snippet; the filter is computed once instead of per row.
    console.log(rule);
    console.log('COUNTRY PRIORITY ORDER (for enrichment script)');
    console.log(rule);
    console.log('');
    console.log('const COUNTRY_PRIORITY = [');
    const pending = stats.filter((s) => s.needEnrichment > 0);
    pending.forEach((stat, index) => {
      const comma = index < pending.length - 1 ? ',' : '';
      console.log(` '${stat.country}'${comma} // ${stat.needEnrichment.toLocaleString()} churches`);
    });
    console.log('];');
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // process.exit() here would terminate before the finally block could close connections.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
analyzeEnrichmentPriority();

View File

@@ -0,0 +1,66 @@
#!/usr/bin/env tsx
/**
* Check the 2 potentially real bugs
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Re-scrapes two church websites (one Spanish, one Polish) with the generic scraper and
 * prints the outcome plus up to five sample schedules for each.
 *
 * When the Polish page yields no schedules but raw HTML is available, the HTML is reduced
 * to lower-cased text and a 300-character snippet starting at the first schedule keyword
 * that occurs in it is printed.
 *
 * The scraper is closed even when a scrape throws.
 */
async function checkRealBugs() {
  const days = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    console.log('=== 1. Iglesia de San Fernando (trying Spanish page) ===\n');
    scraper.setCountry('ES');
    const spanishUrl = 'https://www.parroquiasanfernandomaspalomas.net/'; // URL with /de/ removed
    const result1 = await scraper.scrape(spanishUrl);
    console.log(`URL: ${spanishUrl}`);
    console.log(`Success: ${result1.success}`);
    console.log(`Schedules: ${result1.schedules.length}`);
    console.log(`Error: ${result1.error || 'none'}\n`);
    if (result1.schedules.length > 0) {
      console.log('Sample schedules:');
      for (const s of result1.schedules.slice(0, 5)) {
        console.log(` ${days[s.dayOfWeek]} ${s.time} - ${s.language} ${s.massType}`);
      }
    }

    console.log('\n=== 2. Kościół (Poland) ===\n');
    scraper.setCountry('PL');
    const result2 = await scraper.scrape('http://parafialubojna.pl');
    console.log(`Success: ${result2.success}`);
    console.log(`Schedules: ${result2.schedules.length}`);
    console.log(`Error: ${result2.error || 'none'}\n`);
    if (result2.schedules.length > 0) {
      console.log('Sample schedules:');
      for (const s of result2.schedules.slice(0, 5)) {
        console.log(` ${days[s.dayOfWeek]} ${s.time} - ${s.language} ${s.massType}`);
      }
    } else if (result2.rawHtml) {
      // Drop scripts, styles and tags, then collapse whitespace, to get searchable text.
      const text = result2.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Polish schedule keywords, in priority order. indexOf() returns -1 (truthy) when a
      // keyword is missing, so chaining the calls with `||` never reached the fallbacks;
      // pick the first keyword that is actually present instead.
      const keywords = ['msze', 'msza', 'nabożeńst'];
      const scheduleIndex =
        keywords.map((keyword) => text.indexOf(keyword)).find((index) => index !== -1) ?? -1;
      if (scheduleIndex !== -1) {
        const snippet = text.substring(scheduleIndex, scheduleIndex + 300);
        console.log('Found schedule section:');
        console.log(snippet);
      }
    }
  } finally {
    await scraper.close();
  }
}
checkRealBugs().catch((error: unknown) => {
  console.error(error);
  // Report failure through the exit code instead of exiting 0.
  process.exitCode = 1;
});

View File

@@ -0,0 +1,79 @@
import { Pool } from 'pg';
import * as dotenv from 'dotenv';
import * as path from 'path';
// Load .env.local first (takes precedence), then .env; both paths are resolved against the current working directory.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
// NOTE(review): DATABASE_URL is not validated here; when it is unset the pool is created with an undefined connectionString — confirm that is intended.
const pool = new Pool({
connectionString: process.env.DATABASE_URL,
});
/**
 * Prints a Google Place ID enrichment report from the `churches` table: the 20 countries
 * with the most rows lacking a `google_place_id`, overall enriched/pending totals with
 * percentages, and per-day counts of enriched rows updated during the last 7 days.
 *
 * Errors are logged and not rethrown; the pool is always closed.
 */
async function checkEnrichmentDetail() {
  // Count columns are handled as strings here (the original parsed them with parseInt),
  // so convert explicitly with a radix before doing arithmetic.
  const toCount = (value: unknown): number => Number.parseInt(String(value), 10);
  // Percentage with two decimals; an empty table yields 0.00 rather than NaN.
  const percent = (part: number, total: number): string =>
    (total > 0 ? (part / total) * 100 : 0).toFixed(2);

  try {
    console.log('Connecting to database...\n');

    // Churches awaiting enrichment, grouped by country.
    const pendingResult = await pool.query(`
SELECT
country,
COUNT(*) as pending_count
FROM churches
WHERE google_place_id IS NULL
GROUP BY country
ORDER BY pending_count DESC
LIMIT 20;
`);
    console.log('=== Churches Awaiting Enrichment (Top 20 Countries) ===');
    let totalPending = 0;
    for (const row of pendingResult.rows) {
      console.log(`${row.country}: ${row.pending_count} churches`);
      totalPending += toCount(row.pending_count);
    }
    // Only the 20 listed countries are summed, hence "shown".
    console.log(`\nTotal pending shown: ${totalPending}`);

    // Overall totals.
    const statsResult = await pool.query(`
SELECT
COUNT(*) as total_churches,
COUNT(CASE WHEN google_place_id IS NOT NULL THEN 1 END) as enriched,
COUNT(CASE WHEN google_place_id IS NULL THEN 1 END) as pending
FROM churches;
`);
    const totals = statsResult.rows[0];
    const totalChurches = toCount(totals.total_churches);
    const enriched = toCount(totals.enriched);
    const pending = toCount(totals.pending);
    console.log('\n=== Overall Stats ===');
    console.log(`Total churches: ${totalChurches}`);
    console.log(`Enriched: ${enriched} (${percent(enriched, totalChurches)}%)`);
    console.log(`Pending: ${pending} (${percent(pending, totalChurches)}%)`);

    // Daily activity: enriched rows whose updated_at falls within the last 7 days.
    const rateResult = await pool.query(`
SELECT
DATE(updated_at) as date,
COUNT(*) as enriched_count
FROM churches
WHERE google_place_id IS NOT NULL
AND updated_at > NOW() - INTERVAL '7 days'
GROUP BY DATE(updated_at)
ORDER BY date DESC;
`);
    console.log('\n=== Enrichment Activity (Last 7 Days) ===');
    if (rateResult.rows.length === 0) {
      console.log('No enrichment activity in the last 7 days');
    } else {
      for (const row of rateResult.rows) {
        console.log(`${row.date}: ${row.enriched_count} churches`);
      }
    }
  } catch (error) {
    console.error('Error checking enrichment detail:', error);
  } finally {
    await pool.end();
  }
}
checkEnrichmentDetail();

View File

@@ -0,0 +1,146 @@
import { config } from 'dotenv';
import { PrismaClient } from '@prisma/client';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
// Load environment files: .env.local is read first, then .env.
config({ path: '.env.local' });
config({ path: '.env' });
// Fail fast at module load when no connection string is configured.
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
throw new Error('DATABASE_URL environment variable is not set');
}
// A single pg pool backs the Prisma client through the PrismaPg adapter.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Prints the enrichment status of OSM-sourced churches: overall totals, a per-country
 * breakdown for a fixed list of ten priority countries, and a completion estimate at a
 * fixed daily throughput.
 *
 * On failure the error is logged and the process exit code is set to 1. The Prisma client
 * and the pg pool are closed in every case.
 */
async function checkEnrichmentStatus() {
  // Daily throughput assumed by the timeline estimate.
  const CHURCHES_PER_DAY = 390;
  const PRIORITY_COUNTRIES = ['FR', 'DE', 'ES', 'PL', 'BR', 'PT', 'PH', 'CZ', 'MX', 'HU'];
  const rule = '═══════════════════════════════════════════════════════════════';
  // Percentage formatter; a zero total yields 0 instead of NaN.
  const pct = (part: number, total: number, digits: number): string =>
    (total > 0 ? (part / total) * 100 : 0).toFixed(digits);

  try {
    console.log('Checking enrichment status...\n');

    // "Recently" means updated within the last 24 hours.
    const yesterday = new Date();
    yesterday.setDate(yesterday.getDate() - 1);

    // The overall counts are independent of each other, so they run concurrently.
    const [totalOSM, enriched, withWebsite, needEnrichment, recentlyEnriched] = await Promise.all([
      prisma.church.count({
        where: { source: 'osm' },
      }),
      prisma.church.count({
        where: {
          source: 'osm',
          googlePlaceId: { not: null },
        },
      }),
      prisma.church.count({
        where: {
          source: 'osm',
          hasWebsite: true,
        },
      }),
      prisma.church.count({
        where: {
          source: 'osm',
          hasWebsite: false,
          website: null,
        },
      }),
      prisma.church.count({
        where: {
          source: 'osm',
          googlePlaceId: { not: null },
          updatedAt: { gte: yesterday },
        },
      }),
    ]);

    console.log(rule);
    console.log('OVERALL ENRICHMENT STATUS');
    console.log(rule);
    console.log(`Total OSM churches: ${totalOSM.toLocaleString()}`);
    console.log(`Churches with Google Place ID: ${enriched.toLocaleString()} (${pct(enriched, totalOSM, 2)}%)`);
    console.log(`Churches with websites: ${withWebsite.toLocaleString()} (${pct(withWebsite, totalOSM, 2)}%)`);
    console.log(`Need enrichment: ${needEnrichment.toLocaleString()} (${pct(needEnrichment, totalOSM, 2)}%)`);
    console.log('');
    console.log(`Recently enriched (24h): ${recentlyEnriched.toLocaleString()}`);
    console.log('');

    console.log(rule);
    console.log('TOP 10 PRIORITY COUNTRIES STATUS');
    console.log(rule);
    console.log('');
    // Countries are processed in order so the report keeps the priority order; the four
    // counts within one country run concurrently.
    for (const country of PRIORITY_COUNTRIES) {
      const [total, countryEnriched, countryWithWebsite, countryNeedEnrichment] = await Promise.all([
        prisma.church.count({
          where: { source: 'osm', country },
        }),
        prisma.church.count({
          where: {
            source: 'osm',
            country,
            googlePlaceId: { not: null },
          },
        }),
        // NOTE(review): this also counts churches that only have a Google Place ID, unlike
        // the overall "with websites" figure (hasWebsite only) — confirm which is intended.
        prisma.church.count({
          where: {
            source: 'osm',
            country,
            OR: [
              { hasWebsite: true },
              { googlePlaceId: { not: null } },
            ],
          },
        }),
        prisma.church.count({
          where: {
            source: 'osm',
            country,
            hasWebsite: false,
            website: null,
          },
        }),
      ]);
      // A country without any churches reports 0.0% rather than NaN%.
      const websitePercent = pct(countryWithWebsite, total, 1);
      const enrichedPercent = pct(countryEnriched, total, 1);
      console.log(`${country.padEnd(4)} | Total: ${String(total).padStart(6)} | Enriched: ${String(countryEnriched).padStart(5)} (${enrichedPercent}%) | With Website: ${String(countryWithWebsite).padStart(5)} (${websitePercent}%) | Need: ${String(countryNeedEnrichment).padStart(6)}`);
    }
    console.log('');

    // Timeline estimate at the assumed daily throughput.
    const daysRemaining = Math.ceil(needEnrichment / CHURCHES_PER_DAY);
    const monthsRemaining = (daysRemaining / 30).toFixed(1);
    const completion = new Date(Date.now() + daysRemaining * 24 * 60 * 60 * 1000);
    console.log(rule);
    console.log('TIMELINE ESTIMATE');
    console.log(rule);
    console.log(`At ${CHURCHES_PER_DAY} churches/day:`);
    console.log(` Days remaining: ${daysRemaining} days`);
    console.log(` Months remaining: ~${monthsRemaining} months`);
    console.log(` Estimated completion: ${completion.toLocaleDateString()}`);
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // process.exit() here would terminate before the finally block could close connections.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
checkEnrichmentStatus();

View File

@@ -0,0 +1,78 @@
import { Pool } from 'pg';
import * as dotenv from 'dotenv';
import * as path from 'path';
// Load .env.local first (takes precedence), then .env; both paths are resolved against the current working directory.
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
// NOTE(review): DATABASE_URL is not validated here; when it is unset the pool is created with an undefined connectionString — confirm that is intended.
const pool = new Pool({
connectionString: process.env.DATABASE_URL,
});
/**
 * Summarises Google Place ID enrichment in the `churches` table: overall totals, the ten
 * countries with the most enriched rows updated in the last 24 hours, and up to twenty of
 * the most recently updated enriched rows.
 *
 * Errors are logged and not rethrown; the pool is always closed.
 */
async function checkEnrichment() {
  try {
    console.log('Connecting to database...');

    // Totals over every row that has a Google Place ID.
    const summaryQuery = await pool.query(`
SELECT
COUNT(*) as total_enriched,
COUNT(CASE WHEN updated_at > NOW() - INTERVAL '24 hours' THEN 1 END) as enriched_last_24h,
MAX(updated_at) as last_enrichment
FROM churches
WHERE google_place_id IS NOT NULL;
`);
    const summary = summaryQuery.rows[0];
    console.log('\n=== Google Enrichment Summary ===');
    console.log(`Total churches with Google Place ID: ${summary.total_enriched}`);
    console.log(`Enriched in last 24 hours: ${summary.enriched_last_24h}`);
    console.log(`Last enrichment: ${summary.last_enrichment}`);

    // Per-country breakdown, ordered by last-24h activity.
    const perCountry = await pool.query(`
SELECT
country,
COUNT(*) as enriched_count,
COUNT(CASE WHEN updated_at > NOW() - INTERVAL '24 hours' THEN 1 END) as enriched_last_24h
FROM churches
WHERE google_place_id IS NOT NULL
GROUP BY country
ORDER BY enriched_last_24h DESC
LIMIT 10;
`);
    console.log('\n=== Top Countries Enriched (Last 24h) ===');
    for (const entry of perCountry.rows) {
      console.log(`${entry.country}: ${entry.enriched_last_24h} new / ${entry.enriched_count} total`);
    }

    // Sample of the latest enriched rows.
    const latest = await pool.query(`
SELECT
name,
city,
country,
google_place_id,
updated_at
FROM churches
WHERE google_place_id IS NOT NULL
AND updated_at > NOW() - INTERVAL '24 hours'
ORDER BY updated_at DESC
LIMIT 20;
`);
    console.log('\n=== Recent Enrichments (Last 24h, sample) ===');
    for (const entry of latest.rows) {
      let timestamp = 'unknown';
      if (entry.updated_at) {
        timestamp = new Date(entry.updated_at).toISOString();
      }
      console.log(`${entry.name}, ${entry.city}, ${entry.country} - ${timestamp}`);
    }
  } catch (error) {
    console.error('Error checking enrichment:', error);
  } finally {
    await pool.end();
  }
}
checkEnrichment();

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env tsx
/**
* Check the full section text for German church to understand office hours pattern
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrapes https://www.alterpeter.de/ with the generic scraper (country 'DE') and prints the
 * page text that follows the first occurrence of "montag" (200 characters) and "sonntag"
 * (300 characters), to inspect how the weekday sections are laid out.
 *
 * The scraper is closed even when the scrape throws.
 */
async function checkGerman() {
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape('https://www.alterpeter.de/');
    if (!result.rawHtml) {
      return;
    }

    // Drop scripts, styles and tags, then collapse whitespace, to get searchable text.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Prints `length` characters starting at the first occurrence of `keyword`, if any.
    const printContext = (keyword: string, heading: string, length: number): void => {
      const start = text.indexOf(keyword);
      if (start === -1) {
        return;
      }
      console.log(heading);
      console.log(text.substring(start, start + length));
      console.log('');
    };

    printContext('montag', '=== Monday (Montag) context ===', 200);
    printContext('sonntag', '=== Sunday (Sonntag) context ===', 300);
  } finally {
    await scraper.close();
  }
}
checkGerman().catch((error: unknown) => {
  console.error(error);
  // Report failure through the exit code instead of exiting 0.
  process.exitCode = 1;
});

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env tsx
import { config } from 'dotenv';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Load environment variables
config({ path: '.env.local' });
config({ path: '.env' });
/**
 * Connects through Prisma (pg adapter) and reports how many churches with country 'PL'
 * exist, how many of them have at least one mass schedule, and a sample of three.
 *
 * @throws Error when DATABASE_URL is not set.
 */
async function main() {
  const connectionString = process.env.DATABASE_URL;
  if (!connectionString) {
    // Fail fast instead of building a pool from an empty connection string.
    throw new Error('DATABASE_URL environment variable is not set');
  }
  // Mask the password portion before echoing the URL.
  console.log('DATABASE_URL:', connectionString.replace(/:[^:@]+@/, ':****@'));

  const pool = new Pool({ connectionString });
  const adapter = new PrismaPg(pool);
  const prisma = new PrismaClient({ adapter });
  try {
    console.log('PrismaClient created:', !!prisma);
    // NOTE(review): confirm `churches` is the model accessor exposed by the generated client.
    console.log('prisma.churches:', !!prisma.churches);
    await prisma.$connect();

    const count = await prisma.churches.count({ where: { country: 'PL' } });
    console.log(`Poland churches in Neon: ${count}`);

    const withSchedules = await prisma.churches.count({
      where: {
        country: 'PL',
        massSchedules: { some: {} },
      },
    });
    console.log(`With mass schedules: ${withSchedules}`);

    // Sample a few churches together with their schedules.
    const sample = await prisma.churches.findMany({
      where: { country: 'PL' },
      include: { massSchedules: true },
      take: 3,
    });
    console.log('\nSample churches:');
    for (const church of sample) {
      console.log(` - ${church.name} (${church.city}): ${church.massSchedules.length} schedules`);
    }
  } finally {
    // Release connections even when a query fails.
    await prisma.$disconnect();
    await pool.end();
  }
}
main().catch((error: unknown) => {
  console.error(error);
  // Report failure through the exit code instead of exiting 0.
  process.exitCode = 1;
});

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env tsx
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrapes http://parafialubojna.pl with the generic scraper (country 'PL') and lists every
 * occurrence of "niedziela" in the page text, each with 30 characters of leading and 70
 * characters of trailing context (measured from the start of the match).
 *
 * The scraper is closed even when the scrape throws.
 */
async function check() {
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('PL');
    const result = await scraper.scrape('http://parafialubojna.pl');
    if (!result.rawHtml) {
      return;
    }

    // Drop scripts, styles and tags, then collapse whitespace, to get searchable text.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    const matches: Array<{ position: number; context: string }> = [];
    // Resume the search one character after each hit so no occurrence is skipped.
    for (
      let idx = text.indexOf('niedziela');
      idx !== -1;
      idx = text.indexOf('niedziela', idx + 1)
    ) {
      matches.push({
        position: idx,
        context: text.substring(Math.max(0, idx - 30), idx + 70),
      });
    }

    console.log(`niedziela occurrences: ${matches.length}\n`);
    matches.forEach((match, i) => {
      console.log(`Occurrence ${i + 1} at position ${match.position}:`);
      console.log(` "${match.context}"`);
      console.log('');
    });
  } finally {
    await scraper.close();
  }
}
check().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,34 @@
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
/**
 * Prints OSM-sourced church counts from the `churches` table: the overall total, the 40
 * countries with the most churches, the counts for a fixed list of countries that were
 * previously under-imported, and the number of distinct countries.
 *
 * The pool is closed even when a query fails.
 */
async function main() {
  try {
    const totalRes = await pool.query(`SELECT COUNT(*) as total FROM churches WHERE source = 'osm'`);
    console.log('Total OSM churches:', totalRes.rows[0].total);

    const countryRes = await pool.query(`SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country IS NOT NULL GROUP BY country ORDER BY count DESC LIMIT 40`);
    console.log('\nTop 40 countries by OSM church count:');
    for (const row of countryRes.rows) {
      console.log(` ${row.country}: ${row.count}`);
    }

    // Check key countries that were under-imported.
    const keyCountries = ['AT','HR','UA','RO','LV','BY','RS','BA','MK','AL','EE','GE','AM','RU','IN','JP','CA','US','MX','AR','CO','ID','CN'];
    const keyRes = await pool.query(`SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country = ANY($1) GROUP BY country ORDER BY count DESC`, [keyCountries]);
    console.log('\nKey countries to check (were under-imported):');
    // Index the counts by country; a country with no rows is absent from the result.
    const found = new Map<string, unknown>();
    for (const row of keyRes.rows) {
      found.set(row.country, row.count);
    }
    // Report in the order of the list above, printing 0 for absent countries.
    for (const c of keyCountries) {
      console.log(` ${c}: ${found.get(c) ?? 0}`);
    }

    // Total countries.
    const countriesRes = await pool.query(`SELECT COUNT(DISTINCT country) as total FROM churches WHERE source = 'osm'`);
    console.log(`\nTotal countries with OSM data: ${countriesRes.rows[0].total}`);
  } finally {
    await pool.end();
  }
}
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,88 @@
#!/usr/bin/env tsx
/**
* Check production database (Neon) for data
* Run with: npx tsx scripts/check-production-db.ts
*/
import { Pool } from 'pg';
import { config } from 'dotenv';
// Load environment variables (.env.local overrides .env)
config({ path: '.env.local' });
config({ path: '.env' });
// Abort with exit code 1 when no connection string is configured.
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
console.error('❌ DATABASE_URL not found in environment');
process.exit(1);
}
console.log('🔍 Checking production database...');
// Any URL that does not contain 'neon.tech' is labelled 'localhost', whatever its actual host.
console.log('📍 Connection:', connectionString.includes('neon.tech') ? 'Neon (Production)' : 'localhost');
const pool = new Pool({ connectionString });
/**
 * Runs a sequence of sanity checks against the configured database: connectivity, the list
 * of tables in the `public` schema, row counts for churches, mass schedules and liturgical
 * days, and whether a liturgical-day row exists for today's date.
 *
 * On failure the error is logged and the process exit code is set to 1; the pool is closed
 * in every case.
 */
async function checkDatabase() {
  try {
    // Test connection
    console.log('\n1⃣ Testing database connection...');
    await pool.query('SELECT NOW()');
    console.log('✅ Database connection successful');

    // Check tables exist
    console.log('\n2⃣ Checking tables...');
    const tablesResult = await pool.query(`
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public'
ORDER BY table_name
`);
    console.log(`✅ Found ${tablesResult.rows.length} tables:`, tablesResult.rows.map(r => r.table_name).join(', '));

    // Check churches
    console.log('\n3⃣ Checking churches...');
    const churchCount = await pool.query('SELECT COUNT(*) FROM "churches"');
    console.log(`📊 Churches: ${churchCount.rows[0].count}`);
    // The count is parsed with an explicit radix before the comparison.
    if (Number.parseInt(churchCount.rows[0].count, 10) > 0) {
      const sampleChurch = await pool.query('SELECT id, name, city, state, latitude, longitude FROM "churches" LIMIT 1');
      console.log('📍 Sample church:', sampleChurch.rows[0]);
    } else {
      console.log('⚠️ No churches found in database!');
    }

    // Check mass schedules
    console.log('\n4⃣ Checking mass schedules...');
    const massCount = await pool.query('SELECT COUNT(*) FROM "mass_schedules"');
    console.log(`📊 Mass schedules: ${massCount.rows[0].count}`);

    // Check liturgical days
    console.log('\n5⃣ Checking liturgical days...');
    const liturgicalCount = await pool.query('SELECT COUNT(*) FROM "liturgical_days"');
    console.log(`📊 Liturgical days: ${liturgicalCount.rows[0].count}`);

    // Check today's liturgical data.
    // NOTE(review): toISOString() yields the UTC calendar date, which can differ from the
    // local date near midnight — confirm which one the `date` column is keyed on.
    const today = new Date().toISOString().split('T')[0];
    const todayData = await pool.query(
      'SELECT * FROM "liturgical_days" WHERE date = $1',
      [today]
    );
    if (todayData.rows.length > 0) {
      console.log(`✅ Today's liturgical data exists:`, todayData.rows[0].season);
    } else {
      console.log(`⚠️ No liturgical data for today (${today})`);
    }
    console.log('\n✨ Database check complete!\n');
  } catch (error) {
    console.error('❌ Error:', error);
    // process.exit() here would terminate before the finally block could close the pool.
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
checkDatabase();

View File

@@ -0,0 +1,164 @@
import { config } from 'dotenv';
import { PrismaClient } from '@prisma/client';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
// Load environment files: .env.local is read first, then .env.
config({ path: '.env.local' });
config({ path: '.env' });
// Fail fast at module load when no connection string is configured.
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
throw new Error('DATABASE_URL environment variable is not set');
}
// A single pg pool backs the Prisma client through the PrismaPg adapter.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Prints the state of mass-schedule scraping: churches per data source, overall coverage
 * figures, the ten most recently scraped churches, and how many churches have a website or
 * mass-schedule URL but were never scraped.
 *
 * On failure the error is logged and the process exit code is set to 1. The Prisma client
 * and the pg pool are closed in every case.
 */
async function checkScraperStatus() {
  const rule = '═══════════════════════════════════════════════════════════════';
  // One-decimal percentage; a zero total yields 0.0 instead of NaN.
  const pct = (part: number, total: number): string =>
    (total > 0 ? (part / total) * 100 : 0).toFixed(1);

  try {
    console.log('Checking mass schedule scraper status...\n');

    // "Recently" means scraped within the last 7 days.
    const weekAgo = new Date();
    weekAgo.setDate(weekAgo.getDate() - 7);

    // The scalar counts are independent of each other, so they run concurrently.
    const [
      totalChurches,
      churchesWithWebsites,
      churchesScraped,
      totalMassSchedules,
      churchesWithSchedules,
      recentlyScraped,
      readyToScrape,
    ] = await Promise.all([
      prisma.church.count(),
      prisma.church.count({
        where: {
          OR: [
            { website: { not: null } },
            { massScheduleUrl: { not: null } },
          ],
        },
      }),
      prisma.church.count({
        where: { lastScrapedAt: { not: null } },
      }),
      prisma.massSchedule.count(),
      prisma.church.count({
        where: {
          massSchedules: {
            some: {},
          },
        },
      }),
      prisma.church.count({
        where: {
          lastScrapedAt: { gte: weekAgo },
        },
      }),
      // Churches ready to scrape: have a URL but no scrape timestamp.
      prisma.church.count({
        where: {
          OR: [
            { website: { not: null } },
            { massScheduleUrl: { not: null } },
          ],
          lastScrapedAt: null,
        },
      }),
    ]);

    // Church counts per data source.
    const bySource = await prisma.church.groupBy({
      by: ['source'],
      _count: {
        id: true,
      },
    });
    console.log(rule);
    console.log('CHURCH DATA SOURCES');
    console.log(rule);
    for (const source of bySource) {
      const percent = pct(source._count.id, totalChurches);
      console.log(`${source.source.padEnd(12)} | ${String(source._count.id).padStart(7)} churches (${percent}%)`);
    }
    console.log('');

    console.log(rule);
    console.log('MASS SCHEDULE SCRAPING STATUS');
    console.log(rule);
    console.log(`Total churches: ${totalChurches.toLocaleString()}`);
    console.log(`Churches with websites: ${churchesWithWebsites.toLocaleString()} (${pct(churchesWithWebsites, totalChurches)}%)`);
    console.log(`Churches ever scraped: ${churchesScraped.toLocaleString()} (${pct(churchesScraped, totalChurches)}%)`);
    console.log(`Churches with mass schedules: ${churchesWithSchedules.toLocaleString()} (${pct(churchesWithSchedules, totalChurches)}%)`);
    console.log(`Total mass schedules: ${totalMassSchedules.toLocaleString()}`);
    console.log('');
    console.log(`Scraped in last 7 days: ${recentlyScraped.toLocaleString()}`);
    console.log('');

    // The average is only meaningful when at least one church has schedules.
    if (churchesWithSchedules > 0) {
      const avgSchedules = totalMassSchedules / churchesWithSchedules;
      console.log(`Average schedules per church: ${avgSchedules.toFixed(1)} masses/week`);
      console.log('');
    }

    // The ten most recently scraped churches, with their schedule counts.
    const recentSample = await prisma.church.findMany({
      where: {
        lastScrapedAt: { not: null },
      },
      select: {
        name: true,
        city: true,
        state: true,
        country: true,
        lastScrapedAt: true,
        website: true,
        source: true,
        _count: {
          select: {
            massSchedules: true,
          },
        },
      },
      orderBy: { lastScrapedAt: 'desc' },
      take: 10,
    });
    console.log(rule);
    console.log('RECENTLY SCRAPED CHURCHES (Last 10)');
    console.log(rule);
    if (recentSample.length === 0) {
      console.log('No churches have been scraped yet.');
    } else {
      recentSample.forEach((church, index) => {
        // Empty location parts are skipped.
        const location = [church.city, church.state, church.country].filter(Boolean).join(', ');
        console.log(`${index + 1}. ${church.name} (${location})`);
        console.log(` Source: ${church.source}`);
        console.log(` Website: ${church.website || 'None'}`);
        console.log(` Last scraped: ${church.lastScrapedAt?.toLocaleString() || 'Never'}`);
        console.log(` Mass schedules: ${church._count.massSchedules}`);
        console.log('');
      });
    }

    console.log(rule);
    console.log('SCRAPING POTENTIAL');
    console.log(rule);
    console.log(`Churches ready to scrape: ${readyToScrape.toLocaleString()}`);
    console.log(` (have website, never scraped)`);
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // process.exit() here would terminate before the finally block could close connections.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
checkScraperStatus();

View File

@@ -0,0 +1,47 @@
import { Pool } from 'pg';
/**
 * Lists the columns of a table, in ordinal order, from `information_schema.columns`.
 *
 * The lookup is restricted to one schema so that same-named tables in other schemas are
 * not mixed into the result.
 *
 * @param pool - Pool connected to the database to inspect.
 * @param table - Table name to look up.
 * @param schema - Schema that contains the table; defaults to `public`.
 * @returns The matching rows, each carrying `column_name` and `data_type`.
 */
async function getColumns(pool: Pool, table: string, schema = 'public') {
  const result = await pool.query(
    `SELECT column_name, data_type FROM information_schema.columns WHERE table_name = $1 AND table_schema = $2 ORDER BY ordinal_position`,
    [table, schema]
  );
  return result.rows;
}
/**
 * Compares the column sets of the core tables between the NAS database and the Neon
 * database and prints, per table, the columns that exist on only one side.
 *
 * Connection strings come from the environment so that no credentials live in source:
 * - `NEON_DATABASE_URL` (required)
 * - `NAS_DATABASE_URL` (optional; falls back to the LAN default)
 *
 * This script does not load any .env file itself, so both variables must already be set in
 * the process environment.
 *
 * @throws Error when NEON_DATABASE_URL is not set.
 */
async function run() {
  const neonUrl = process.env.NEON_DATABASE_URL;
  if (!neonUrl) {
    throw new Error('NEON_DATABASE_URL environment variable is not set');
  }
  const nasUrl =
    process.env.NAS_DATABASE_URL ?? 'postgresql://postgres:postgres@192.168.0.145:5434/nearestmass';

  const nas = new Pool({ connectionString: nasUrl });
  const neon = new Pool({
    connectionString: neonUrl,
    // NOTE(review): certificate verification is disabled here, as in the original — confirm
    // it is still required.
    ssl: { rejectUnauthorized: false },
  });

  try {
    for (const table of ['churches', 'mass_schedules', 'confession_schedules', 'adoration_schedules']) {
      // The two lookups hit different databases, so they can run concurrently.
      const [nasCols, neonCols] = await Promise.all([
        getColumns(nas, table),
        getColumns(neon, table),
      ]);
      const nasNames = new Set(nasCols.map((c) => c.column_name));
      const neonNames = new Set(neonCols.map((c) => c.column_name));
      const onlyNas = nasCols.filter((c) => !neonNames.has(c.column_name));
      const onlyNeon = neonCols.filter((c) => !nasNames.has(c.column_name));

      if (onlyNas.length === 0 && onlyNeon.length === 0) {
        console.log(`\n=== ${table} === (schemas match)`);
        continue;
      }

      console.log(`\n=== ${table} ===`);
      if (onlyNas.length) {
        console.log(' NAS only:');
        for (const c of onlyNas) console.log(` - ${c.column_name} (${c.data_type})`);
      }
      if (onlyNeon.length) {
        console.log(' Neon only:');
        for (const c of onlyNeon) console.log(` - ${c.column_name} (${c.data_type})`);
      }
    }
  } finally {
    // Close both pools even when a query fails or one of the shutdowns rejects.
    await Promise.allSettled([nas.end(), neon.end()]);
  }
}
run().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,48 @@
import { Pool } from 'pg';
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
/**
 * Prints a database overview using one pooled client: church totals, website / schedule /
 * Google enrichment counts, top countries, church sources, the latest scrape timestamp,
 * scrape-job statuses and mass schedules per language.
 *
 * The client is released and the pool closed even when a query fails.
 */
async function main() {
  const client = await pool.connect();
  try {
    // Runs a single-row count query and formats the `count` column for display.
    const countOf = async (sql: string): Promise<string> => {
      const result = await client.query(sql);
      return Number(result.rows[0].count).toLocaleString();
    };

    const total = await countOf('SELECT count(*) FROM "Church"');
    console.log('\n=== DATABASE OVERVIEW ===');
    console.log('Churches total:', total);
    console.log('With website:', await countOf('SELECT count(*) FROM "Church" WHERE website IS NOT NULL'));
    console.log('With mass schedules:', await countOf('SELECT count(DISTINCT "churchId") FROM "MassSchedule"'));
    console.log('Google Places enriched:', await countOf('SELECT count(*) FROM "Church" WHERE "googlePlaceId" IS NOT NULL'));
    console.log('Total mass schedules:', await countOf('SELECT count(*) FROM "MassSchedule"'));

    const countries = await client.query('SELECT country, count(*) as cnt FROM "Church" GROUP BY country ORDER BY cnt DESC LIMIT 15');
    console.log('\n=== TOP COUNTRIES ===');
    for (const r of countries.rows) console.log(' ' + (r.country || '(null)') + ':', Number(r.cnt).toLocaleString());

    const sources = await client.query('SELECT source, count(*) as cnt FROM "Church" GROUP BY source ORDER BY cnt DESC LIMIT 10');
    console.log('\n=== CHURCH SOURCES ===');
    for (const r of sources.rows) console.log(' ' + (r.source || '(null)') + ':', Number(r.cnt).toLocaleString());

    const lastScrape = await client.query('SELECT "lastScrapedAt" FROM "Church" WHERE "lastScrapedAt" IS NOT NULL ORDER BY "lastScrapedAt" DESC LIMIT 1');
    console.log('\n=== LAST SCRAPE ===');
    // No row means that no church carries a scrape timestamp.
    console.log(lastScrape.rows[0]?.lastScrapedAt || 'No scrapes yet');

    const jobs = await client.query('SELECT status, count(*) as cnt FROM "ScrapeJob" GROUP BY status ORDER BY cnt DESC');
    console.log('\n=== JOB STATUS ===');
    for (const r of jobs.rows) console.log(' ' + r.status + ':', Number(r.cnt).toLocaleString());

    const schedulesByLang = await client.query('SELECT language, count(*) as cnt FROM "MassSchedule" GROUP BY language ORDER BY cnt DESC LIMIT 10');
    console.log('\n=== SCHEDULES BY LANGUAGE ===');
    for (const r of schedulesByLang.rows) console.log(' ' + (r.language || '(null)') + ':', Number(r.cnt).toLocaleString());
  } finally {
    // Hand the client back before shutting the pool down.
    client.release();
    await pool.end();
  }
}
main().catch(e => { console.error(e.message); process.exit(1); });

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env tsx
/**
* Debug a specific French page to see why scraping failed
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrapes one hard-coded French site with GenericScraper (country 'FR') and
 * prints diagnostics: the scrape outcome, a 2000-character sample of the
 * tag-stripped page text, which French weekday names occur in it, and up to
 * 20 time-like tokens.
 *
 * The scraper is closed in a `finally` block so that a throwing scrape does
 * not skip cleanup (close() is assumed to release whatever init() acquired).
 */
async function debugPage() {
  const url = 'https://www.chemin-neuf.fr/'; // Last failed church
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('FR');
    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules found: ${result.schedules.length}`);
    if (result.error) console.log(`Error: ${result.error}`);

    if (result.rawHtml) {
      // Drop script/style bodies, strip remaining tags, collapse whitespace
      // and lowercase so the substring checks below are case-insensitive.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      console.log('\n=== Page Text Sample (first 2000 chars) ===');
      console.log(text.substring(0, 2000));
      console.log('\n');

      // Check for French day names
      const frenchDays = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'];
      console.log('=== French day names found ===');
      for (const day of frenchDays) {
        if (text.includes(day)) {
          console.log(`✓ Found: ${day}`);
        }
      }

      // Check for time patterns
      console.log('\n=== Time patterns (sample) ===');
      const timeRegex = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:AM|PM|am|pm|Uhr|uur|h)?/g;
      const times = text.match(timeRegex);
      if (times) {
        console.log(`Found ${times.length} time-like patterns:`);
        console.log(times.slice(0, 20).join(', '));
      } else {
        console.log('No time patterns found');
      }
    }
  } finally {
    await scraper.close();
  }
}
debugPage().catch(console.error);

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env tsx
/**
* Debug why German church has duplicate schedules
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Patch GenericScraper.prototype.parseSchedules with a logging wrapper.
// Bracket notation is used to reach members that are presumably declared
// private/protected on the class — TODO confirm against generic.ts.
// NOTE: the patch is never undone within this script.
const originalParse = GenericScraper.prototype['parseSchedules'];
GenericScraper.prototype['parseSchedules'] = function(html: string) {
// Reduce the HTML to lowercase plain text: drop <script>/<style> bodies,
// strip the remaining tags and collapse whitespace runs.
const text = html
.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
.replace(/<[^>]+>/g, ' ')
.replace(/\s+/g, ' ')
.toLowerCase();
// Call findScheduleSections and log result
const sections = this['findScheduleSections'](text);
console.log('\n=== Sections found ===\n');
// Indexed by section.day / sched.dayOfWeek; index 0 is labelled Sunday.
const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
sections.forEach((section: any, i: number) => {
console.log(`Section ${i + 1}: ${dayNames[section.day]} (day ${section.day})`);
console.log(`  Text preview: "${section.text.substring(0, 100)}..."`);
});
console.log(`\nTotal sections: ${sections.length}\n`);
// Continue with normal processing: delegate to the original implementation
// with the untouched HTML so the real parsing result is unchanged.
const result = originalParse.call(this, html);
console.log(`\n=== Extracted times per section ===\n`);
// Group the parsed schedules by dayOfWeek for a per-day summary.
const schedsByDay: Record<number, typeof result> = {};
for (const sched of result) {
if (!schedsByDay[sched.dayOfWeek]) schedsByDay[sched.dayOfWeek] = [];
schedsByDay[sched.dayOfWeek].push(sched);
}
// Print only the days that have at least one schedule.
for (let i = 0; i < 7; i++) {
if (schedsByDay[i]) {
console.log(`${dayNames[i]}: ${schedsByDay[i].map(s => s.time).join(', ')}`);
}
}
return result;
};
/**
 * Scrapes one hard-coded German site with GenericScraper (country 'DE') and
 * prints the success flag and the number of schedules returned.
 *
 * The scraper is closed in a `finally` block so that a throwing scrape does
 * not skip cleanup (close() is assumed to release whatever init() acquired).
 */
async function testGerman() {
  const url = 'https://www.alterpeter.de/';
  console.log(`Testing: ${url}`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape(url);

    console.log(`\n=== Final Result ===`);
    console.log(`Success: ${result.success}`);
    console.log(`Total schedules: ${result.schedules.length}`);
  } finally {
    await scraper.close();
  }
}
testGerman().catch(console.error);

View File

@@ -0,0 +1,44 @@
import { chromium } from 'playwright';
/**
 * Loads a masstimes.org search page in headless Chromium, saves a full-page
 * screenshot to /tmp/masstimes-debug.png, prints the first 5000 characters of
 * the rendered HTML and up to 50 text-node snippets (11–99 characters long).
 *
 * The browser is closed in a `finally` block: if navigation times out or any
 * later step throws, the Chromium process would otherwise stay alive and keep
 * this script from exiting.
 */
async function main() {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    const url = 'https://masstimes.org/search?lat=32.7765&lng=-79.9311&type=parish';
    console.log('Loading:', url);
    await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });

    // Wait for Angular to render
    await page.waitForTimeout(5000);

    // Take screenshot
    await page.screenshot({ path: '/tmp/masstimes-debug.png', fullPage: true });
    console.log('Screenshot saved to /tmp/masstimes-debug.png');

    // Get page HTML
    const html = await page.content();
    console.log('\n--- PAGE HTML (first 5000 chars) ---\n');
    console.log(html.substring(0, 5000));

    // Try to find any visible text that looks like church names.
    // Runs inside the page: walks every text node under <body>.
    const visibleText = await page.evaluate(() => {
      const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
      const texts: string[] = [];
      let node: Node | null;
      while ((node = walker.nextNode())) {
        const text = node.textContent?.trim();
        // Keep medium-length strings only; skips empty nodes and long paragraphs.
        if (text && text.length > 10 && text.length < 100) {
          texts.push(text);
        }
      }
      return texts.slice(0, 50);
    });

    console.log('\n--- VISIBLE TEXT SNIPPETS ---\n');
    visibleText.forEach((t, i) => console.log(`${i + 1}. ${t}`));
  } finally {
    await browser.close();
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,74 @@
#!/usr/bin/env tsx
/**
* Deep dive into Paróquia da Paz parsing bug
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrapes one hard-coded Brazilian parish site (country 'BR') and prints, from
 * the tag-stripped lowercase page text: the context around the first
 * occurrence of each Portuguese weekday name, every "16h" / "16h30" style
 * time token (first 30), and a 300-character excerpt after each schedule
 * keyword.
 *
 * The scraper is closed in a `finally` block so that a throwing scrape does
 * not skip cleanup (close() is assumed to release whatever init() acquired).
 */
async function debugPaz() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('BR');
    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find where days appear
      console.log('=== Finding day + time patterns ===\n');
      const days = ['domingo', 'segunda', 'terça', 'terca', 'quarta', 'quinta', 'sexta', 'sábado', 'sabado'];
      for (const day of days) {
        const dayIndex = text.indexOf(day);
        if (dayIndex !== -1) {
          // Show context around the day (100 chars before and 200 after)
          const before = Math.max(0, dayIndex - 100);
          const after = Math.min(text.length, dayIndex + 200);
          const snippet = text.substring(before, after);
          console.log(`${day.toUpperCase()}:`);
          console.log(`  Position: ${dayIndex}`);
          console.log(`  Context: ...${snippet}...`);
          console.log('');
        }
      }

      // Check for "h" time format specifically
      console.log('\n=== Checking "h" time format ===');
      const hTimeRegex = /(\d{1,2})h(\d{2})?/g;
      const hTimes = text.match(hTimeRegex);
      if (hTimes) {
        console.log(`Found ${hTimes.length} "h" format times:`);
        console.log(hTimes.slice(0, 30).join(', '));
      }

      // Look for schedule structure
      console.log('\n=== Looking for schedule structure ===');
      const scheduleKeywords = ['horário', 'horario', 'missa', 'missas', 'santa missa'];
      for (const keyword of scheduleKeywords) {
        const index = text.indexOf(keyword);
        if (index !== -1) {
          console.log(`\nFound "${keyword}" at position ${index}:`);
          // substring() clamps to the end of the text, so no Math.min is needed.
          console.log(text.substring(index, index + 300));
        }
      }
    }
  } finally {
    await scraper.close();
  }
}
debugPaz().catch(console.error);

View File

@@ -0,0 +1,150 @@
#!/usr/bin/env tsx
/**
* Debug the 5 parsing bugs identified in top 5 test
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Prisma client backed by a pg Pool through the PrismaPg adapter.
// NOTE(review): DATABASE_URL is not checked for presence here; if it is unset
// the Pool is constructed with an undefined connection string.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// The churches with parsing bugs.
// `name` is the display label, `country` the code used both for the database
// filter and for scraper.setCountry(), and `searchTerm` the substring matched
// against the church name in the database.
const BUG_CHURCHES = [
{ name: 'St. Marien', country: 'DE', searchTerm: 'St. Marien' },
{ name: 'Santuario de Manalagua', country: 'ES', searchTerm: 'Santuario de Manalagua' },
{ name: 'Kościół pw. Najświętszego Serca', country: 'PL', searchTerm: 'Najświętszego Serca Pana Jez' },
{ name: 'Paróquia de Nossa Senhora do Desterro', country: 'BR', searchTerm: 'Nossa Senhora do Desterro' },
{ name: 'Paróquia da Paz', country: 'BR', searchTerm: 'Paróquia da Paz' },
];
/**
 * For every entry in BUG_CHURCHES: looks the church up in the database
 * (matching country, name substring and a non-null website), scrapes its
 * website and prints diagnostics about the tag-stripped page text — a text
 * sample, which weekday names and schedule keywords for that country occur,
 * the time-like tokens found, and a few heuristics for known trouble spots.
 *
 * A failure while scraping one church is logged and the loop continues. The
 * scraper, Prisma client and pg pool are closed in a `finally` block so that
 * an unexpected error (e.g. a failing database query) does not skip cleanup.
 */
async function debugBugs() {
  console.log('Debugging parsing bugs...\n');

  // Lowercase weekday names per country code, with and without diacritics.
  const dayPatterns: Record<string, string[]> = {
    DE: ['sonntag', 'montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag'],
    ES: ['domingo', 'lunes', 'martes', 'miércoles', 'miercoles', 'jueves', 'viernes', 'sábado', 'sabado'],
    PL: ['niedziela', 'poniedziałek', 'poniedzialek', 'wtorek', 'środa', 'sroda', 'czwartek', 'piątek', 'piatek', 'sobota'],
    BR: ['domingo', 'segunda', 'terça', 'terca', 'quarta', 'quinta', 'sexta', 'sábado', 'sabado'],
  };
  // Lowercase mass-schedule keywords per country code.
  const keywords: Record<string, string[]> = {
    DE: ['gottesdienst', 'messe', 'heilige messe', 'messzeiten'],
    ES: ['misa', 'horario', 'eucaristía', 'eucaristia'],
    PL: ['msza', 'msze', 'nabożeństwo', 'nabozenstwo'],
    BR: ['missa', 'horário', 'horario', 'eucaristia'],
  };

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    for (const bug of BUG_CHURCHES) {
      console.log('═'.repeat(80));
      console.log(`BUG: ${bug.name} (${bug.country})`);
      console.log('═'.repeat(80));

      const church = await prisma.church.findFirst({
        where: {
          country: bug.country,
          name: { contains: bug.searchTerm },
          website: { not: null },
        },
      });
      // The query already filters out null websites; the extra check narrows
      // the type so no non-null assertion is needed below.
      if (!church || !church.website) {
        console.log(`❌ Church not found in database\n`);
        continue;
      }

      console.log(`Church: ${church.name}`);
      console.log(`URL: ${church.website}\n`);
      scraper.setCountry(bug.country);

      try {
        const result = await scraper.scrape(church.website);
        console.log(`Success: ${result.success}`);
        console.log(`Schedules found: ${result.schedules.length}`);
        if (result.error) console.log(`Error: ${result.error}`);

        if (result.rawHtml) {
          const text = result.rawHtml
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<[^>]+>/g, ' ')
            .replace(/\s+/g, ' ')
            .toLowerCase();

          console.log('\n--- Text Sample (first 1000 chars) ---');
          console.log(text.substring(0, 1000));

          // Check for day names
          console.log('\n--- Day Names Found ---');
          const foundDays = (dayPatterns[bug.country] ?? []).filter((day) => text.includes(day));
          console.log(`Found: ${foundDays.join(', ') || 'none'}`);

          // Check for time patterns
          console.log('\n--- Time Patterns Found ---');
          const timeRegex = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h|uhr)?/gi;
          const times = text.match(timeRegex);
          if (times) {
            const uniqueTimes = [...new Set(times)].slice(0, 20);
            console.log(`Found ${times.length} time patterns (showing first 20 unique):`);
            console.log(uniqueTimes.join(', '));
          } else {
            console.log('No time patterns found');
          }

          // Look for specific mass schedule keywords
          console.log('\n--- Mass Schedule Keywords ---');
          const foundKeywords = (keywords[bug.country] ?? []).filter((keyword) => text.includes(keyword));
          console.log(`Found: ${foundKeywords.join(', ') || 'none'}`);

          // Look for specific problematic patterns
          console.log('\n--- Looking for edge cases ---');

          // Check if times and days are separated (not in same section).
          // Compare the earliest time token with the earliest day name in the
          // text, and only when both exist; the previous sentinel-string
          // comparison reported YES whenever days were found but no times.
          const firstTimeIndex = text.search(timeRegex);
          const firstDayIndex =
            foundDays.length > 0 ? Math.min(...foundDays.map((day) => text.indexOf(day))) : -1;
          const hasTimeBeforeDays =
            firstTimeIndex !== -1 && firstDayIndex !== -1 && firstTimeIndex < firstDayIndex;
          console.log(`Times come before days: ${hasTimeBeforeDays ? 'YES (potential issue)' : 'no'}`);

          // Check for table structures. colspan/rowspan are tag attributes, so
          // they must be looked up in the raw HTML: `text` has tags stripped.
          const rawLower = result.rawHtml.toLowerCase();
          const hasTables =
            rawLower.includes('colspan') ||
            rawLower.includes('rowspan') ||
            (text.match(/\s+\|\s+/g)?.length ?? 0) > 5;
          console.log(`Likely table format: ${hasTables ? 'YES (may need special handling)' : 'no'}`);

          // Check for multiple languages on same page
          const hasMultiLang =
            (text.match(/english|español|espanol|portuguese|português|portugues|deutsch|polski/gi)?.length ?? 0) > 1;
          console.log(`Multiple languages: ${hasMultiLang ? 'YES (may confuse parser)' : 'no'}`);
        }
        console.log('\n');
      } catch (err: unknown) {
        // Keep going with the next church; one broken site should not end the run.
        const message = err instanceof Error ? err.message : String(err);
        console.log(`❌ ERROR: ${message}\n`);
      }
    }
  } finally {
    await scraper.close();
    await prisma.$disconnect();
    await pool.end();
  }
}
debugBugs().catch(console.error);

View File

@@ -0,0 +1,98 @@
#!/usr/bin/env tsx
/**
* Debug the full parsing flow with section detection
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
/**
 * Scrapes one hard-coded Brazilian parish site, locates the schedule text that
 * starts with "segundas, terças" in the tag-stripped page, and checks whether
 * the single-day section regex matches "sábados", "sabados" and "domingos"
 * inside a 500-character snippet. Finally prints how many parsed schedules the
 * scraper returned for each weekday.
 *
 * The scraper is closed in a single `finally` block, which covers both early
 * returns and any thrown error.
 */
async function debugFullFlow() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('BR');
    const result = await scraper.scrape(url);

    if (!result.rawHtml) {
      console.log('No HTML received');
      return;
    }

    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Find the schedule section
    const scheduleIndex = text.indexOf('segundas, terças');
    if (scheduleIndex === -1) {
      console.log('Schedule text not found!');
      return;
    }

    const snippet = text.substring(scheduleIndex, scheduleIndex + 500);
    console.log('Schedule snippet from actual HTML:');
    console.log(snippet);
    console.log('\n');

    // Now test section matching on actual text.
    // Longest names first so the alternation prefers the most specific name.
    const dayConfigs = getDayNamesForCountry('BR');
    const dayPatterns = buildDayPatterns(dayConfigs);
    const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
    const allDayNamesPattern = sortedDayNames.map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');

    console.log('=== Testing sábados and domingos matches ===\n');
    // Accented, unaccented and Sunday variants, each probed with the same
    // "day name, separator, lazy text up to the next day name" pattern.
    for (const dayName of ['sábados', 'sabados', 'domingos']) {
      const dayRegex = new RegExp(
        `(?:^|\\s|[,;:])${dayName}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
        'i'
      );
      const dayMatch = snippet.match(dayRegex);
      console.log(`${dayName} match:`, dayMatch ? `Found: "${dayMatch[1].substring(0, 50)}"` : 'Not found');
    }

    console.log('\n=== Final parsed schedules ===\n');
    console.log(`Total: ${result.schedules.length}`);

    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    // Indexed by dayOfWeek; index 0 is labelled Domingo (Sunday).
    const dayNames = ['Domingo', 'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado'];
    for (let i = 0; i < 7; i++) {
      if (byDay[i]) {
        console.log(`${dayNames[i]}: ${byDay[i].length} schedules`);
      } else {
        console.log(`${dayNames[i]}: 0 schedules ❌`);
      }
    }
  } finally {
    await scraper.close();
  }
}
debugFullFlow().catch(console.error);

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env tsx
/**
* Debug which sections are being found
*/
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
// Simulate the exact text from the page
// Offline probe: runs the grouped-day and single-day section regexes against
// a fixed copy of the page's schedule text, without touching the network.
const scheduleText = `
horário das missas igreja matriz de santo antônio
segundas, terças, quartas e sextas-feiras: 16h e 18h.
quintas-feiras: 16h e 19h (adoração ao santíssimo 18h).
sábados: 8h, 16h e 18h.
domingos: 8h, 11h, 16h, 18h e 20h.
`.toLowerCase();

console.log('Text to parse:');
console.log(scheduleText);
console.log('');

// Escapes regex metacharacters so a day name can be embedded in a pattern.
const escapeForRegex = (raw: string): string => raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

const dayPatterns = buildDayPatterns(getDayNamesForCountry('BR'));
// Longest names first so the alternation prefers the most specific name.
const allDayNamesPattern = Object.keys(dayPatterns)
  .sort((a, b) => b.length - a.length)
  .map(escapeForRegex)
  .join('|');

console.log('=== COMMA-SEPARATED GROUP MATCHING ===\n');
// Two or more day names joined by commas / conjunctions, then a separator,
// then lazily everything up to the next day name or the end of the text.
const dayGroupRegex = new RegExp(
  `((?:${allDayNamesPattern})(?:[,\\s]+(?:e|and|et|und|y)?\\s*(?:${allDayNamesPattern}))+)[:\\s]+([^]*?)(?=(?:${allDayNamesPattern})|$)`,
  'gi'
);
let matchNumber = 0;
for (const hit of scheduleText.matchAll(dayGroupRegex)) {
  matchNumber += 1;
  console.log(`Match #${matchNumber}:`);
  console.log(`  Day group: "${hit[1]}"`);
  console.log(`  Time text: "${hit[2]}"`);
  console.log('');
}

console.log('=== INDIVIDUAL DAY MATCHING ===\n');
Object.entries(dayPatterns).forEach(([dayName, dayIndex]) => {
  const singleDayRegex = new RegExp(
    `(?:^|\\s|[,;:])${escapeForRegex(dayName)}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );
  const found = scheduleText.match(singleDayRegex);
  if (!found) return;
  console.log(`Found ${dayName} (day ${dayIndex}):`);
  console.log(`  Time text: "${found[1].substring(0, 100)}"`);
});

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env tsx
/**
* Debug Paróquia da Paz with added logging
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
/**
 * Scrapes one hard-coded Brazilian parish site and runs the comma-separated
 * day-group regex against the tag-stripped page text, printing every match.
 * When nothing matches, it falls back to locating "segundas, terças" manually
 * and reports which of the ten longest configured day names occur in a
 * 300-character snippet.
 *
 * The scraper is closed in a `finally` block so that a throwing scrape does
 * not skip cleanup (close() is assumed to release whatever init() acquired).
 */
async function debugPazWithLogging() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('BR');
    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Test the regex pattern manually
      console.log('=== Testing comma-separated day grouping regex ===\n');
      const dayConfigs = getDayNamesForCountry('BR');
      const dayPatterns = buildDayPatterns(dayConfigs);
      // Longest names first so the alternation prefers the most specific name.
      const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
      const allDayNamesPattern = sortedDayNames.map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');

      console.log('Day patterns:', Object.keys(dayPatterns).join(', '));
      console.log('');

      // The exact regex from the code
      const dayGroupRegex = new RegExp(
        `((?:${allDayNamesPattern})(?:[,\\s]+(?:e|and|et|und|y)?\\s*(?:${allDayNamesPattern}))+)[:\\s]+([^]*?)(?=(?:${allDayNamesPattern})|$)`,
        'gi'
      );
      console.log('Regex pattern:', dayGroupRegex.source.substring(0, 200) + '...\n');

      let matchCount = 0;
      for (const groupMatch of text.matchAll(dayGroupRegex)) {
        matchCount++;
        console.log(`Match #${matchCount}:`);
        console.log(`  Full match: "${groupMatch[0].substring(0, 100)}"`);
        console.log(`  Day group: "${groupMatch[1]}"`);
        console.log(`  Time text: "${groupMatch[2].substring(0, 50)}"`);
        console.log('');
      }

      if (matchCount === 0) {
        console.log('No matches found!\n');

        // Try to find the schedule text manually
        const scheduleIndex = text.indexOf('segundas, terças');
        if (scheduleIndex !== -1) {
          const snippet = text.substring(scheduleIndex, scheduleIndex + 300);
          console.log('Found schedule text at position', scheduleIndex);
          console.log('Snippet:', snippet);
          console.log('');

          // Test if individual day names are matching
          console.log('Testing individual day name matches in snippet:');
          for (const dayName of sortedDayNames.slice(0, 10)) {
            if (snippet.includes(dayName)) {
              console.log(`  ✓ Found: ${dayName}`);
            }
          }
        }
      }
    }
  } finally {
    await scraper.close();
  }
}
debugPazWithLogging().catch(console.error);

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env tsx
/**
* Debug Polish church in detail
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
/**
 * Scrapes one hard-coded Polish parish site (country 'PL'), locates the
 * "msze święte" / "msze swiete" schedule section in the tag-stripped page
 * text and prints: a 500-character snippet, the space-separated ("8 00") and
 * colon-separated ("8:00") time tokens in it, the configured Polish day names
 * it contains, and finally the parsed schedules grouped by weekday.
 *
 * The scraper is closed in a `finally` block so that a throwing scrape does
 * not skip cleanup (close() is assumed to release whatever init() acquired).
 */
async function debugPolish() {
  const url = 'http://parafialubojna.pl';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('PL');
    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules found: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find the schedule section: first heading variant that is present.
      // `indexOf(a) || indexOf(b)` cannot be used here because the not-found
      // value -1 is truthy (the fallback never runs) while a hit at offset 0
      // is falsy (the fallback wrongly replaces a valid hit).
      const scheduleIndex =
        ['msze święte', 'msze swiete']
          .map((heading) => text.indexOf(heading))
          .find((index) => index !== -1) ?? -1;

      if (scheduleIndex !== -1) {
        const snippet = text.substring(scheduleIndex, scheduleIndex + 500);
        console.log('Schedule section:');
        console.log(snippet);
        console.log('\n');

        // Test all time pattern matches
        console.log('=== Testing time pattern matches ===\n');

        // Space separator pattern
        const spacePattern = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;
        const spaceMatches = snippet.match(spacePattern);
        console.log('Space-separated times (8 00, 9 30):');
        console.log(spaceMatches ? spaceMatches.join(', ') : 'none');
        console.log('');

        // Colon pattern
        const colonPattern = /\d{1,2}:\d{2}/g;
        const colonMatches = snippet.match(colonPattern);
        console.log('Colon times (8:00, 9:30):');
        console.log(colonMatches ? colonMatches.join(', ') : 'none');
        console.log('');

        // Polish day names
        console.log('=== Polish day names in snippet ===\n');
        const dayConfigs = getDayNamesForCountry('PL');
        const dayPatterns = buildDayPatterns(dayConfigs);
        for (const [dayName, dayNum] of Object.entries(dayPatterns)) {
          if (snippet.includes(dayName)) {
            console.log(`Found: ${dayName} (day ${dayNum})`);
          }
        }
      }
    }

    console.log('\n=== Parsed schedules ===\n');
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    // Indexed by dayOfWeek; index 0 is labelled Niedziela (Sunday).
    const dayNames = ['Niedziela', 'Poniedziałek', 'Wtorek', 'Środa', 'Czwartek', 'Piątek', 'Sobota'];
    for (let i = 0; i < 7; i++) {
      if (byDay[i]) {
        console.log(`${dayNames[i]}: ${byDay[i].map(s => s.time).join(', ')}`);
      }
    }
  } finally {
    await scraper.close();
  }
}
debugPolish().catch(console.error);

View File

@@ -0,0 +1,79 @@
#!/usr/bin/env tsx
/**
* Debug why Sunday and Monday aren't parsing for Polish church
*/
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
// Exact schedule text from website
// Offline probe: checks whether the single-day section regex captures the
// times for "niedziela" and "poniedziałek" in a fixed copy of the page text.
const text = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();

console.log('Text to parse:');
console.log(text);
console.log('\n');

const dayConfigs = getDayNamesForCountry('PL');
const dayPatterns = buildDayPatterns(dayConfigs);
// Longest names first so the alternation prefers the most specific name.
const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
const allDayNamesPattern = sortedDayNames.map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');

/**
 * Runs the section regex for one day name against `text` and prints the full
 * match, the captured section and any space-separated times ("8 00") in it.
 */
function probeDay(heading: string, dayName: string): void {
  console.log(heading);

  // Current regex pattern: day name, then either a label of up to 50
  // non-colon characters ending in ':' or plain whitespace, then lazily
  // everything up to the next day name or the end of the text.
  const escaped = dayName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const regex = new RegExp(
    `(?:^|\\s|[,;:])${escaped}(?:(?:[^:]{1,50})?:|\\s+)([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );

  const match = text.match(regex);
  if (!match) {
    console.log(`✗ NOT matched`);
    return;
  }

  console.log(`✓ Matched!`);
  console.log(`  Full match: "${match[0].substring(0, 100)}"`);
  console.log(`  Captured text: "${match[1].substring(0, 100)}"`);
  console.log('');

  // Check if times can be extracted
  const times = match[1].match(/\b(\d{1,2})\s+(\d{2})(?!\d)/g);
  console.log(`  Times found: ${times ? times.join(', ') : 'none'}`);
}

probeDay('=== Testing niedziela (Sunday) ===\n', 'niedziela');
probeDay('\n=== Testing poniedziałek (Monday) ===\n', 'poniedziałek');

console.log('\n=== Analyzing why niedziela might fail ===\n');
// The issue might be "niedziela i uroczystości:" - the phrase is long
// Check if the lookahead is hitting "uroczystości" before getting to the times
const niedzielaIndex = text.indexOf('niedziela');
// Start searching after the word itself; any configured name that is a prefix
// of "niedziela" would otherwise be reported at the very same offset.
const searchFrom = niedzielaIndex + 'niedziela'.length;
const laterDayPositions = sortedDayNames
  .filter(d => d !== 'niedziela')
  .map(d => text.indexOf(d, searchFrom))
  .filter(i => i !== -1);
// Math.min() with no arguments is Infinity, so handle "no later day" explicitly.
const nextDayIndex = laterDayPositions.length > 0 ? Math.min(...laterDayPositions) : -1;

console.log(`niedziela position: ${niedzielaIndex}`);
console.log(`Next day name position: ${nextDayIndex === -1 ? 'none found' : nextDayIndex}`);
console.log(`Text between: "${text.substring(niedzielaIndex, nextDayIndex === -1 ? text.length : nextDayIndex)}"`);

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env tsx
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrapes https://www.alterpeter.de/ (country 'DE') and prints every place in
 * the tag-stripped page text where "montag … bis … donnerstag" occurs, with
 * 150 characters of context before and 250 characters from the match onwards.
 *
 * The scraper is closed in a `finally` block so that a throwing scrape does
 * not skip cleanup (close() is assumed to release whatever init() acquired).
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (result.rawHtml) {
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find "montag bis donnerstag" pattern; the lazy gaps allow arbitrary
      // text between the three words.
      const pattern = /montag[^]*?bis[^]*?donnerstag/gi;
      const matches = [...text.matchAll(pattern)];
      console.log(`Found ${matches.length} instances of "montag bis donnerstag":\n`);

      matches.forEach((match, i) => {
        const matchIndex = match.index ?? 0;
        const contextBefore = text.substring(Math.max(0, matchIndex - 150), matchIndex);
        const contextAfter = text.substring(matchIndex, Math.min(text.length, matchIndex + 250));

        console.log(`=== Instance ${i + 1} ===`);
        console.log(`Position: ${matchIndex}`);
        console.log(`\nContext BEFORE (150 chars):`);
        console.log(`"${contextBefore}"`);
        console.log(`\nContext AFTER (250 chars):`);
        console.log(`"${contextAfter}"`);
        console.log('');
      });
    }
  } finally {
    await scraper.close();
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env tsx
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrapes https://www.alterpeter.de/ (country 'DE') and prints every
 * standalone "00 uhr" token in the tag-stripped page text, with 50 characters
 * of context before and 100 characters from the match onwards, followed by
 * the total count.
 *
 * The scraper is closed in a `finally` block so that a throwing scrape does
 * not skip cleanup (close() is assumed to release whatever init() acquired).
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (result.rawHtml) {
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find all instances of "00 uhr" pattern. \b requires a word boundary
      // before "00", so the minutes part of "18:00 uhr" or "18 00 uhr" is hit
      // but not the tail of a longer digit run.
      const pattern = /\b00\s*uhr/g;
      let count = 0;
      let match: RegExpExecArray | null;

      console.log('Looking for "00 uhr" patterns:\n');
      while ((match = pattern.exec(text)) !== null) {
        count++;
        const matchIndex = match.index;
        const contextBefore = text.substring(Math.max(0, matchIndex - 50), matchIndex);
        const contextAfter = text.substring(matchIndex, Math.min(text.length, matchIndex + 100));

        console.log(`=== Occurrence ${count} at position ${matchIndex} ===`);
        console.log(`BEFORE: "...${contextBefore}"`);
        console.log(`MATCH + AFTER: "${contextAfter}..."`);
        console.log('');
      }
      console.log(`Total "00 uhr" occurrences: ${count}`);
    }
  } finally {
    await scraper.close();
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env tsx
import { config } from 'dotenv';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import fs from 'fs/promises';
config({ path: '.env.local' });
/**
 * Exports every church with country 'DE', together with its mass, confession
 * and adoration schedules, from the database at DATABASE_URL into
 * `export-DE.json` in the current working directory.
 *
 * The Prisma client and pg pool are closed in a `finally` block so that a
 * failed query or file write does not leave connections open.
 */
async function main() {
  console.log('📦 Exporting Germany from Neon...');

  const pool = new Pool({ connectionString: process.env.DATABASE_URL });
  const adapter = new PrismaPg(pool);
  const prisma = new PrismaClient({ adapter });

  try {
    await prisma.$connect();

    const churches = await prisma.churches.findMany({
      where: { country: 'DE' },
      include: {
        massSchedules: true,
        confessionSchedules: true,
        adorationSchedules: true,
      }
    });
    console.log(`Found ${churches.length} churches in Germany`);

    await fs.writeFile('export-DE.json', JSON.stringify(churches, null, 2));
    console.log(`✅ Exported to export-DE.json`);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,60 @@
#!/usr/bin/env tsx
/**
* Export churches from NAS database to JSON
* Run this ON THE NAS (uses DATABASE_URL from .env)
*/
import { PrismaClient } from '@prisma/client';
import fs from 'fs/promises';
/**
 * Dumps all churches of one country, including their mass, confession and
 * adoration schedules, to `export-<country>.json` in the working directory.
 *
 * The country code is read from the first CLI argument and falls back to
 * 'PL'. On any failure the error is logged, the client is disconnected and
 * the process exits with status 1.
 */
async function main() {
  const country = process.argv[2] || 'PL';
  console.log(`📦 Exporting ${country} data from database...`);
  // Mask the password portion of the URL before echoing it.
  const maskedUrl = process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@');
  console.log(`DATABASE_URL: ${maskedUrl}`);

  const prisma = new PrismaClient();
  try {
    await prisma.$connect();
    console.log('✅ Connected to database');

    // Export churches with all schedules
    const churches = await prisma.churches.findMany({
      where: { country },
      include: {
        massSchedules: true,
        confessionSchedules: true,
        adorationSchedules: true,
      }
    });
    console.log(`Found ${churches.length} churches in ${country}`);

    // Count schedules in a single pass over the export.
    let massSchedules = 0;
    let confessionSchedules = 0;
    let adorationSchedules = 0;
    for (const church of churches) {
      massSchedules += church.massSchedules?.length || 0;
      confessionSchedules += church.confessionSchedules?.length || 0;
      adorationSchedules += church.adorationSchedules?.length || 0;
    }

    // Save to file
    const exportFile = `export-${country}.json`;
    await fs.writeFile(exportFile, JSON.stringify(churches, null, 2));

    const summary = [
      `\n✅ Exported to ${exportFile}`,
      `   - ${churches.length} churches`,
      `   - ${massSchedules} mass schedules`,
      `   - ${confessionSchedules} confession schedules`,
      `   - ${adorationSchedules} adoration schedules`,
      `\nDownload with:`,
      `  scp albert@192.168.0.145:/volume1/docker/nearestmass/${exportFile} .`,
    ];
    for (const line of summary) console.log(line);

    await prisma.$disconnect();
  } catch (err) {
    console.error('❌ Export failed:', err);
    await prisma.$disconnect();
    process.exit(1);
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,230 @@
#!/usr/bin/env tsx
/**
* Export churches from local NAS database and import to Neon
*/
import { PrismaClient } from '@prisma/client';
import fs from 'fs/promises';
import path from 'path';
/** Row counts for one country export, as returned by exportFromNAS. */
interface ExportStats {
/** Number of churches exported. */
churches: number;
/** Total mass schedules across all exported churches. */
massSchedules: number;
/** Total confession schedules across all exported churches. */
confessionSchedules: number;
/** Total adoration schedules across all exported churches. */
adorationSchedules: number;
}
/**
 * Export every church of `country` (including mass, confession and adoration
 * schedules) from the NAS database into `export-<country>.json` in the
 * current working directory.
 *
 * `DATABASE_URL` is temporarily pointed at the NAS while the Prisma client is
 * in use and is always put back afterwards, including the case where it was
 * not set at all. Leaving the NAS URL behind would make a later
 * `new PrismaClient()` talk to the NAS instead of the intended target.
 *
 * The NAS connection string can be overridden with `NAS_DATABASE_URL`; the
 * previous hard-coded value remains the default.
 *
 * @param country Country code used to filter churches.
 * @returns Counts of exported churches and schedule rows.
 * @throws Re-throws any connection, query or file-system error after logging it.
 */
async function exportFromNAS(country: string): Promise<ExportStats> {
  console.log(`📦 Exporting ${country} data from NAS...`);

  // Remember the caller's value (possibly undefined) so it can be restored exactly.
  const originalUrl = process.env.DATABASE_URL;
  process.env.DATABASE_URL =
    process.env.NAS_DATABASE_URL ??
    'postgresql://postgres:postgres@192.168.0.145:5432/nearestmass';

  const nasPrisma = new PrismaClient();
  try {
    await nasPrisma.$connect();
    console.log('✅ Connected to NAS database');

    // Export churches with all schedules
    const churches = await nasPrisma.churches.findMany({
      where: { country },
      include: {
        massSchedules: true,
        confessionSchedules: true,
        adorationSchedules: true,
      },
    });
    console.log(`Found ${churches.length} churches in ${country}`);

    // Count schedules
    const stats: ExportStats = {
      churches: churches.length,
      massSchedules: churches.reduce((sum, c) => sum + (c.massSchedules?.length || 0), 0),
      confessionSchedules: churches.reduce((sum, c) => sum + (c.confessionSchedules?.length || 0), 0),
      adorationSchedules: churches.reduce((sum, c) => sum + (c.adorationSchedules?.length || 0), 0),
    };

    // Save to file
    const exportFile = path.join(process.cwd(), `export-${country}.json`);
    await fs.writeFile(exportFile, JSON.stringify(churches, null, 2));
    console.log(`✅ Exported to ${exportFile}`);
    console.log(` - ${stats.churches} churches`);
    console.log(` - ${stats.massSchedules} mass schedules`);
    console.log(` - ${stats.confessionSchedules} confession schedules`);
    console.log(` - ${stats.adorationSchedules} adoration schedules`);

    return stats;
  } catch (error) {
    console.error('❌ Export failed:', error);
    throw error;
  } finally {
    try {
      await nasPrisma.$disconnect();
    } finally {
      // Restore even if disconnecting failed; delete the variable when it was
      // originally unset so the NAS URL never leaks into later clients.
      if (originalUrl === undefined) {
        delete process.env.DATABASE_URL;
      } else {
        process.env.DATABASE_URL = originalUrl;
      }
    }
  }
}
/**
 * Import the churches stored in `export-<country>.json` (current working
 * directory) into the database the default Prisma client connects to.
 *
 * Each church is upserted by its latitude/longitude pair. When the church
 * already existed, its mass, confession and adoration schedules are deleted
 * before the exported ones are inserted, so re-running the import does not
 * pile up duplicate schedule rows.
 *
 * @param country Country code; selects which export file is read.
 * @param dryRun  When true (the default) nothing is written; every church is
 *                merely counted as "inserted".
 * @throws If the export file cannot be read or parsed, does not contain an
 *         array, or the database connection fails. Per-church errors are
 *         logged and counted but do not abort the run.
 */
async function importToNeon(country: string, dryRun: boolean = true): Promise<void> {
  console.log(`\n📤 Importing ${country} data to Neon...`);
  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No data will be written');
  }

  // Read export file
  const exportFile = path.join(process.cwd(), `export-${country}.json`);
  const data = JSON.parse(await fs.readFile(exportFile, 'utf-8'));
  if (!Array.isArray(data)) {
    throw new Error(`Expected ${exportFile} to contain a JSON array of churches`);
  }
  console.log(`Loaded ${data.length} churches from export file`);

  // Connect to Neon
  const neonPrisma = new PrismaClient();
  try {
    await neonPrisma.$connect();
    console.log('✅ Connected to Neon database');

    let inserted = 0;
    let updated = 0;
    let errors = 0;

    for (const church of data) {
      try {
        const massSchedules = church.massSchedules || [];
        const confessionSchedules = church.confessionSchedules || [];
        const adorationSchedules = church.adorationSchedules || [];

        // Remove relations and ids so only plain church columns are written.
        delete church.massSchedules;
        delete church.confessionSchedules;
        delete church.adorationSchedules;
        delete church.id;

        if (!dryRun) {
          const coordinates = {
            latitude: church.latitude,
            longitude: church.longitude,
          };

          // Decide insert vs. update BEFORE writing. Checking createdAt after
          // the upsert is unreliable because createdAt may come from the export.
          const existing = await neonPrisma.churches.findFirst({
            where: coordinates,
            select: { id: true },
          });

          // Upsert church based on coordinates
          const result = await neonPrisma.churches.upsert({
            where: { latitude_longitude: coordinates },
            create: church,
            update: church,
          });

          if (existing) {
            updated++;
            // Replace, rather than append to, the schedules of a known church.
            await neonPrisma.massSchedules.deleteMany({ where: { churchId: result.id } });
            await neonPrisma.confessionSchedules.deleteMany({ where: { churchId: result.id } });
            await neonPrisma.adorationSchedules.deleteMany({ where: { churchId: result.id } });
          } else {
            inserted++;
          }

          // Insert schedules, re-pointing them at the target church row.
          for (const schedule of massSchedules) {
            delete schedule.id;
            await neonPrisma.massSchedules.create({
              data: { ...schedule, churchId: result.id },
            });
          }
          for (const schedule of confessionSchedules) {
            delete schedule.id;
            await neonPrisma.confessionSchedules.create({
              data: { ...schedule, churchId: result.id },
            });
          }
          for (const schedule of adorationSchedules) {
            delete schedule.id;
            await neonPrisma.adorationSchedules.create({
              data: { ...schedule, churchId: result.id },
            });
          }
        } else {
          // Dry run - just count
          inserted++;
        }

        // Report on the combined counter so updates also advance the progress
        // and the message is not repeated while `inserted` stays unchanged.
        const processed = inserted + updated;
        if (processed > 0 && processed % 100 === 0) {
          console.log(`Progress: ${processed} churches processed...`);
        }
      } catch (error) {
        errors++;
        console.error(`Error importing church ${church.name}:`, error instanceof Error ? error.message : error);
      }
    }

    console.log('\n✅ Import complete!');
    console.log(` - ${inserted} churches inserted`);
    console.log(` - ${updated} churches updated`);
    console.log(` - ${errors} errors`);
  } catch (error) {
    console.error('❌ Import failed:', error);
    throw error;
  } finally {
    await neonPrisma.$disconnect();
  }
}
/**
 * CLI entry point: export one country from the NAS, then import it into Neon.
 *
 * Usage: `<script> [country=PL] [mode=dry-run]`, where mode is `dry-run` or
 * `real-import`. Any other mode is rejected, so a mistyped argument can never
 * be interpreted as permission to write.
 */
async function main() {
  const country = process.argv[2] || 'PL';
  const mode = process.argv[3] || 'dry-run';
  if (mode !== 'dry-run' && mode !== 'real-import') {
    console.error(`❌ Unknown mode "${mode}". Use "dry-run" or "real-import".`);
    process.exit(1);
  }
  const dryRun = mode === 'dry-run';

  console.log('🌍 Export/Import to Neon');
  console.log('========================\n');

  try {
    // Step 1: Export from NAS (the function prints its own statistics).
    await exportFromNAS(country);

    // Step 2: Import to Neon
    await importToNeon(country, dryRun);

    if (dryRun) {
      console.log('\n💡 This was a DRY RUN. To actually import to Neon, run:');
      console.log(` npx tsx scripts/export-import-to-neon.ts ${country} real-import`);
    } else {
      console.log('\n🎉 Data successfully uploaded to Neon!');
    }
  } catch (error) {
    console.error('❌ Process failed:', error);
    process.exit(1);
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,41 @@
#!/usr/bin/env tsx
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Debug helper: scrape https://www.alterpeter.de/ and print the text
 * surrounding every occurrence of "donnerstag" (Thursday) in the page's
 * tag-stripped, lower-cased text.
 *
 * The scraper is closed in a `finally` block so it is released even when the
 * scrape itself throws.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (result.rawHtml) {
      // Drop scripts, styles and tags, collapse whitespace, lower-case.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find all instances of "donnerstag" (Thursday)
      let idx = 0;
      let count = 0;
      while ((idx = text.indexOf('donnerstag', idx)) !== -1) {
        count++;
        const contextBefore = text.substring(Math.max(0, idx - 100), idx);
        const contextAfter = text.substring(idx, Math.min(text.length, idx + 200));
        console.log(`=== Donnerstag occurrence ${count} at position ${idx} ===`);
        console.log(`BEFORE: "...${contextBefore}"`);
        console.log(`AFTER: "${contextAfter}..."`);
        console.log('');
        idx++; // step past this hit so the next search moves forward
      }
      console.log(`Total "donnerstag" occurrences: ${count}`);
    }
  } finally {
    await scraper.close();
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,42 @@
#!/usr/bin/env tsx
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Debug helper: scrape https://www.alterpeter.de/ and print the text around
 * the literal "9.00 12.00". When that exact string is absent, the first hit
 * of a few alternative spellings of nine o'clock is printed instead.
 *
 * The scraper is closed in a `finally` block so it is released even when the
 * scrape itself throws.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (result.rawHtml) {
      // Drop scripts, styles and tags, collapse whitespace, lower-case.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      const idx = text.indexOf('9.00 12.00');
      if (idx !== -1) {
        console.log('Context around "9.00 12.00":');
        console.log(text.substring(Math.max(0, idx - 150), idx + 200));
      } else {
        console.log('Pattern "9.00 12.00" not found');
        // Try alternative patterns; only the first one that matches is shown.
        const patterns = ['9.00', '9:00', '09:00', '09.00'];
        for (const pattern of patterns) {
          const idx2 = text.indexOf(pattern);
          if (idx2 !== -1) {
            console.log(`\nFound "${pattern}" at position ${idx2}:`);
            console.log(text.substring(Math.max(0, idx2 - 100), idx2 + 150));
            break;
          }
        }
      }
    }
  } finally {
    await scraper.close();
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env tsx
/**
* Identify which churches are flagged as "parsing bugs" in top 5 test
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Build a Prisma client that reaches Postgres through a `pg` connection pool
// via the PrismaPg adapter.
// NOTE(review): DATABASE_URL is not checked here; if it is unset the pool is
// created with an undefined connection string — confirm that is acceptable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/** Countries sampled by the scan, as `{ code, name }` pairs. */
const COUNTRIES = [
  { code: 'FR', name: 'France' },
  { code: 'DE', name: 'Germany' },
  { code: 'ES', name: 'Spain' },
  { code: 'PL', name: 'Poland' },
  { code: 'BR', name: 'Brazil' },
];

/** Weekday names (English, French, German, Spanish, Polish, Portuguese) searched for in page text. */
const DAY_NAMES = [
  'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
  'dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi',
  'sonntag', 'montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag',
  'domingo', 'domingos', 'lunes', 'martes', 'miércoles', 'miercoles', 'jueves', 'viernes',
  'sábado', 'sabado', 'sábados', 'sabados',
  'niedziela', 'poniedziałek', 'poniedzialek', 'wtorek', 'środa', 'sroda', 'czwartek',
  'piątek', 'piatek', 'sobota',
  'segunda', 'segundas', 'terça', 'terca', 'terças', 'tercas', 'quarta', 'quartas',
  'quinta', 'quintas', 'sexta', 'sextas',
];

// `\b` only understands ASCII word characters, so a name that starts with a
// non-ASCII letter (e.g. "środa") can never match after a space. Unicode-aware
// lookarounds give proper whole-word matching instead.
const DAY_NAME_PATTERN = new RegExp(
  `(?<![\\p{L}\\p{N}_])(?:${DAY_NAMES.join('|')})(?![\\p{L}\\p{N}_])`,
  'iu',
);

// One or two digits followed by a real time marker: a separator plus two
// minute digits ("9.00", "10h30", "18:15") or a unit ("19h", "18 uhr", "7 pm").
// The previous pattern made everything after the first digit optional and so
// matched any page that contained a digit.
const CLOCK_TIME_PATTERN = /\b\d{1,2}\s?(?:[h:.]\s?\d{2}|h\b|uhr\b|am\b|pm\b)/i;

/** Strip scripts, styles and tags from HTML, collapse whitespace and lower-case the rest. */
function stripHtmlToText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/**
 * Scrape up to 10 OSM-sourced churches with a website for each country in
 * `COUNTRIES` and list those whose scrape was unsuccessful although the page
 * text contains both a weekday name and a clock time ("potential parsing bugs").
 *
 * Scrape errors for a single church are reported as warnings and do not stop
 * the scan. The scraper, Prisma client and pool are always released.
 */
async function identifyBugs() {
  console.log('Identifying "parsing bugs" from top 5 test...\n');

  const scraper = new GenericScraper();
  await scraper.init();

  const bugs: Array<{ country: string; church: string; url: string }> = [];

  try {
    for (const country of COUNTRIES) {
      const churches = await prisma.church.findMany({
        where: {
          country: country.code,
          website: { not: null },
          source: 'osm',
        },
        take: 10,
        orderBy: { createdAt: 'asc' },
      });

      scraper.setCountry(country.code);

      for (const church of churches) {
        // The query already excludes null websites; this guard narrows the type.
        if (!church.website) continue;

        try {
          const result = await scraper.scrape(church.website);
          if (result.success || !result.rawHtml) continue;

          const text = stripHtmlToText(result.rawHtml);
          if (DAY_NAME_PATTERN.test(text) && CLOCK_TIME_PATTERN.test(text)) {
            bugs.push({
              country: country.name,
              church: church.name,
              url: church.website,
            });
          }
        } catch (err: unknown) {
          // Best effort: keep scanning, but leave a trace of what was skipped.
          const reason = err instanceof Error ? err.message : String(err);
          console.warn(`Skipping ${church.name} (${church.website}): ${reason}`);
        }
      }
    }

    console.log(`\n${'='.repeat(80)}`);
    console.log(`FOUND ${bugs.length} POTENTIAL PARSING BUGS\n`);
    bugs.forEach((bug, i) => {
      console.log(`${i + 1}. ${bug.church} (${bug.country})`);
      console.log(` URL: ${bug.url}`);
      console.log('');
    });
  } finally {
    await scraper.close();
    await prisma.$disconnect();
    await pool.end();
  }
}
identifyBugs().catch(console.error);

View File

@@ -0,0 +1,232 @@
#!/usr/bin/env tsx
/**
* Import churches from JSON export to Neon database
* Run this LOCALLY (uses DATABASE_URL from .env pointing to Neon)
*/
import { PrismaClient } from '@prisma/client';
import fs from 'fs/promises';
import path from 'path';
/**
 * One element of the `export-<country>.json` array as read by this script:
 * a church row plus its optional schedule relations.
 */
interface ChurchExport {
// NOTE(review): declared required, but the importer removes it with `delete`
// before writing; under `strict` a `delete` operand must be optional — confirm.
id: string;
name: string;
latitude: number;
longitude: number;
country: string;
// Schedule relations; the importer treats a missing list as empty.
massSchedules?: any[];
confessionSchedules?: any[];
adorationSchedules?: any[];
// Any further church columns present in the export are passed through as-is.
[key: string]: any;
}
/**
 * CLI entry point: import `export-<country>.json` (current working directory)
 * into the database selected by `DATABASE_URL`.
 *
 * Usage: `<script> [country=PL] [mode=dry-run]`, where mode is `dry-run` or
 * `real-import`. Any other mode is rejected up front. Previously every value
 * other than `dry-run`, including a typo, performed a real import, which
 * deletes and rewrites schedules.
 *
 * For each exported church, matched by latitude/longitude:
 *  - existing church: the row is updated and its mass, confession and
 *    adoration schedules are replaced by the exported ones;
 *  - unknown church: the row and its schedules are created.
 *
 * Per-church failures are logged and counted without aborting the run. A
 * missing export file prints instructions for producing it. Any fatal error
 * ends the process with status 1.
 */
async function main() {
  const country = process.argv[2] || 'PL';
  const mode = process.argv[3] || 'dry-run';
  if (mode !== 'dry-run' && mode !== 'real-import') {
    console.error(`❌ Unknown mode "${mode}". Use "dry-run" or "real-import".`);
    process.exit(1);
  }
  const dryRun = mode === 'dry-run';

  console.log(`📤 Importing ${country} data to Neon...`);
  console.log(`DATABASE_URL: ${process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@')}`);
  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No data will be written');
  }

  // Read export file
  const exportFile = path.join(process.cwd(), `export-${country}.json`);
  try {
    const data: ChurchExport[] = JSON.parse(await fs.readFile(exportFile, 'utf-8'));
    console.log(`Loaded ${data.length} churches from export file`);

    // Connect to Neon
    const prisma = new PrismaClient();
    try {
      await prisma.$connect();
      console.log('✅ Connected to Neon database');

      let inserted = 0;
      let updated = 0;
      const skipped = 0; // reported for output compatibility; no code path skips a church
      let errors = 0;
      let totalMassSchedules = 0;
      let totalConfessionSchedules = 0;
      let totalAdorationSchedules = 0;

      for (const church of data) {
        try {
          // Split the export entry into plain church columns and its relations;
          // the exported primary key is dropped so the target assigns its own.
          const {
            id: _exportedId,
            massSchedules: exportedMass,
            confessionSchedules: exportedConfession,
            adorationSchedules: exportedAdoration,
            ...churchData
          } = church;
          const massSchedules = exportedMass || [];
          const confessionSchedules = exportedConfession || [];
          const adorationSchedules = exportedAdoration || [];

          if (dryRun) {
            // Dry run - just count
            inserted++;
            totalMassSchedules += massSchedules.length;
            totalConfessionSchedules += confessionSchedules.length;
            totalAdorationSchedules += adorationSchedules.length;
          } else {
            // Check if church already exists
            const existing = await prisma.churches.findFirst({
              where: {
                latitude: church.latitude,
                longitude: church.longitude,
              },
            });

            const saved = existing
              ? await prisma.churches.update({ where: { id: existing.id }, data: churchData })
              : await prisma.churches.create({ data: churchData });

            if (existing) {
              // Delete existing schedules so the exported ones replace them.
              await prisma.massSchedules.deleteMany({ where: { churchId: saved.id } });
              await prisma.confessionSchedules.deleteMany({ where: { churchId: saved.id } });
              await prisma.adorationSchedules.deleteMany({ where: { churchId: saved.id } });
            }

            // Insert schedules, re-pointing each at the target church row.
            for (const schedule of massSchedules) {
              delete schedule.id;
              await prisma.massSchedules.create({
                data: { ...schedule, churchId: saved.id },
              });
              totalMassSchedules++;
            }
            for (const schedule of confessionSchedules) {
              delete schedule.id;
              await prisma.confessionSchedules.create({
                data: { ...schedule, churchId: saved.id },
              });
              totalConfessionSchedules++;
            }
            for (const schedule of adorationSchedules) {
              delete schedule.id;
              await prisma.adorationSchedules.create({
                data: { ...schedule, churchId: saved.id },
              });
              totalAdorationSchedules++;
            }

            if (existing) {
              updated++;
            } else {
              inserted++;
            }
          }

          if ((inserted + updated) % 100 === 0) {
            console.log(`Progress: ${inserted + updated} churches processed...`);
          }
        } catch (error) {
          errors++;
          console.error(`Error importing church ${church.name}:`, error instanceof Error ? error.message : error);
        }
      }

      console.log('\n✅ Import complete!');
      console.log(` - ${inserted} churches inserted`);
      console.log(` - ${updated} churches updated`);
      console.log(` - ${skipped} churches skipped`);
      console.log(` - ${errors} errors`);
      console.log(` - ${totalMassSchedules} mass schedules`);
      console.log(` - ${totalConfessionSchedules} confession schedules`);
      console.log(` - ${totalAdorationSchedules} adoration schedules`);

      if (dryRun) {
        console.log('\n💡 This was a DRY RUN. To actually import to Neon, run:');
        console.log(` npx tsx scripts/import-to-neon.ts ${country} real-import`);
      } else {
        console.log('\n🎉 Data successfully uploaded to Neon!');
      }
    } catch (error) {
      console.error('❌ Import failed:', error);
      throw error;
    } finally {
      // Release the connection on success and failure alike.
      await prisma.$disconnect();
    }
  } catch (error) {
    if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {
      console.error(`❌ Export file not found: ${exportFile}`);
      console.error(`\nFirst, export data from NAS:`);
      console.error(` ssh albert@192.168.0.145`);
      console.error(` cd /volume1/docker/nearestmass`);
      console.error(` /usr/local/bin/docker compose --profile tools run --rm scraper npx tsx scripts/export-from-nas.ts ${country}`);
      console.error(`\nThen download the export:`);
      console.error(` scp albert@192.168.0.145:/volume1/docker/nearestmass/export-${country}.json .`);
      console.error(`\nFinally, run this import script again.`);
    } else {
      console.error('❌ Process failed:', error);
    }
    process.exit(1);
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,84 @@
#!/usr/bin/env tsx
/**
* Investigate the 8 potential parsing bugs
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/** Pages flagged as potential parsing bugs, each with the country code handed to the scraper. */
const BUGS = [
  { name: 'Chapelle Saint-Jean-XXIII', country: 'FR', url: 'https://www.chemin-neuf.fr/' },
  { name: 'St. Marien', country: 'DE', url: 'https://www.willehad.de/start/' },
  { name: 'Iglesia de San Fernando', country: 'ES', url: 'https://www.parroquiasanfernandomaspalomas.net/de/' },
  { name: 'Monestir de Sant Esperit', country: 'ES', url: 'https://www.santoespiritu.org/' },
  { name: 'Santuario de Manalagua', country: 'ES', url: 'http://tierrasdeburgos.blogspot.com.es/2013/12/escultura-del-agua-santuario-de.html' },
  { name: 'Kościół pw. Najświętszego Serca', country: 'PL', url: 'http://parafialubojna.pl' },
  { name: 'Paróquia do Desterro', country: 'BR', url: 'https://paroquiaportodegalinhas.blogspot.com.br/' },
  { name: 'Catedral Diocesana', country: 'BR', url: 'http://diocesedejuazeiro.org.br/' },
];

/** Keywords, in priority order, used to anchor the text sample that is printed. */
const SAMPLE_ANCHOR_KEYWORDS = ['mass', 'messe', 'misa', 'missa'];

/**
 * Scrape every entry of `BUGS` and print a short diagnosis: the scrape result
 * and, for unsuccessful scrapes that still returned HTML, keyword-based hints
 * about the page type plus a text sample around the first schedule keyword.
 *
 * A failure on one page is logged and the loop continues. The scraper is
 * always closed.
 */
async function investigate() {
  console.log('Investigating 8 potential bugs...\n');

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    for (const [i, bug] of BUGS.entries()) {
      console.log(`${'='.repeat(80)}`);
      console.log(`${i + 1}. ${bug.name} (${bug.country})`);
      console.log(` ${bug.url}`);
      console.log('='.repeat(80));

      scraper.setCountry(bug.country);

      try {
        const result = await scraper.scrape(bug.url);
        console.log(`Success: ${result.success}`);
        console.log(`Schedules: ${result.schedules.length}`);
        console.log(`Error: ${result.error || 'none'}`);

        if (!result.success && result.rawHtml) {
          // Drop scripts, styles and tags, collapse whitespace, lower-case.
          const text = result.rawHtml
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<[^>]+>/g, ' ')
            .replace(/\s+/g, ' ')
            .toLowerCase();

          // Check page type
          console.log('\nPage analysis:');
          if (text.includes('blogspot')) {
            console.log(' ⚠️ Blogspot page (likely blog post, not church website)');
          }
          if (text.includes('hotel') || text.includes('reservation') || text.includes('booking')) {
            console.log(' ⚠️ Contains hotel/booking keywords');
          }
          if (text.includes('restaurant') || text.includes('menu')) {
            console.log(' ⚠️ Contains restaurant keywords');
          }
          if (text.includes('404') || text.includes('not found') || text.includes('error')) {
            console.log(' ⚠️ Error/404 page');
          }

          // Check if it has schedule keywords
          const hasScheduleKeywords = text.match(/(mass|messe|misa|missa|horário|horario|gottesdienst|eucarist)/i);
          console.log(` Schedule keywords: ${hasScheduleKeywords ? '✓ Found' : '✗ Not found'}`);

          // Show sample text. `indexOf` returns -1 (truthy) on a miss, so an
          // `a || b || c` chain would stop at the first keyword even when it
          // is absent; pick the first keyword that actually occurs instead.
          const massIndex =
            SAMPLE_ANCHOR_KEYWORDS
              .map(keyword => text.indexOf(keyword))
              .find(position => position !== -1) ?? 0;
          const sampleStart = Math.max(0, massIndex - 50);
          const sample = text.substring(sampleStart, sampleStart + 300);
          console.log(`\n Sample text: "${sample.substring(0, 200)}..."`);
        }
        console.log('\n');
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        console.log(`ERROR: ${message}\n\n`);
      }
    }
  } finally {
    await scraper.close();
  }
}
investigate().catch(console.error);

View File

@@ -0,0 +1,134 @@
import { config } from 'dotenv';
import { PrismaClient } from '@prisma/client';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
// Load .env.local first, then .env
config({ path: '.env.local' });
config({ path: '.env' });
// Fail fast at module load when no connection string is configured.
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
throw new Error('DATABASE_URL environment variable is not set');
}
// Prisma client backed by a `pg` connection pool through the PrismaPg adapter.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Write `church-websites.md`: a Markdown report of all churches with summary
 * counts, a per-country breakdown, every church that has a website and every
 * church that has none.
 *
 * Percentages are rendered as `0.0` when the denominator is zero, so an empty
 * table no longer yields `NaN%`. On failure the error is logged and the exit
 * status is set to 1 via `process.exitCode`, which lets the `finally` block
 * close the Prisma client and the pool before the process ends.
 */
async function listChurchWebsites() {
  // Share of `part` in `total` as a one-decimal percentage string.
  const pct = (part: number, total: number): string =>
    total === 0 ? '0.0' : ((part / total) * 100).toFixed(1);

  try {
    console.log('Fetching churches from database...\n');

    const churches = await prisma.church.findMany({
      select: {
        id: true,
        name: true,
        city: true,
        state: true,
        country: true,
        website: true,
        googlePlaceId: true,
      },
      orderBy: [
        { country: 'asc' },
        { state: 'asc' },
        { city: 'asc' },
      ],
    });

    console.log(`Total churches: ${churches.length}`);
    const withWebsite = churches.filter(c => c.website);
    const withGoogle = churches.filter(c => c.googlePlaceId);
    const withoutWebsite = churches.filter(c => !c.website);
    console.log(`Churches with website: ${withWebsite.length}`);
    console.log(`Churches with Google Place ID: ${withGoogle.length}`);
    console.log(`Churches without website: ${withoutWebsite.length}\n`);

    // Group by country
    const byCountry = churches.reduce((acc, church) => {
      const country = church.country || 'Unknown';
      if (!acc[country]) {
        acc[country] = [];
      }
      acc[country].push(church);
      return acc;
    }, {} as Record<string, typeof churches>);
    const countryGroups = Object.entries(byCountry);
    const groupsByName = [...countryGroups].sort(([a], [b]) => a.localeCompare(b));

    // Summary
    let output = '# Church Websites\n\n';
    output += `Generated: ${new Date().toISOString()}\n\n`;
    output += `## Summary\n`;
    output += `- Total churches: ${churches.length}\n`;
    output += `- With website: ${withWebsite.length} (${pct(withWebsite.length, churches.length)}%)\n`;
    output += `- With Google Place ID: ${withGoogle.length} (${pct(withGoogle.length, churches.length)}%)\n`;
    output += `- Without website: ${withoutWebsite.length} (${pct(withoutWebsite.length, churches.length)}%)\n\n`;

    // Add country breakdown, largest country first
    output += `## By Country\n\n`;
    for (const [country, countryChurches] of [...countryGroups].sort(([, a], [, b]) => b.length - a.length)) {
      const siteCount = countryChurches.filter(c => c.website).length;
      const googleCount = countryChurches.filter(c => c.googlePlaceId).length;
      output += `### ${country} (${countryChurches.length} churches)\n`;
      output += `- With website: ${siteCount} (${pct(siteCount, countryChurches.length)}%)\n`;
      output += `- With Google Place ID: ${googleCount} (${pct(googleCount, countryChurches.length)}%)\n\n`;
    }

    // List all websites
    output += `## All Websites\n\n`;
    for (const [country, countryChurches] of groupsByName) {
      output += `### ${country}\n\n`;
      for (const church of countryChurches) {
        const location = [church.city, church.state, church.country].filter(Boolean).join(', ');
        if (church.website) {
          output += `- **${church.name}** (${location})\n`;
          output += ` - Website: ${church.website}\n`;
          if (church.googlePlaceId) {
            output += ` - Google Place ID: ${church.googlePlaceId}\n`;
          }
          output += ` - DB ID: ${church.id}\n\n`;
        }
      }
    }

    // List churches without websites
    output += `## Churches Without Websites\n\n`;
    for (const [country, countryChurches] of groupsByName) {
      const without = countryChurches.filter(c => !c.website);
      if (without.length > 0) {
        output += `### ${country}\n\n`;
        for (const church of without) {
          const location = [church.city, church.state, church.country].filter(Boolean).join(', ');
          output += `- **${church.name}** (${location})\n`;
          if (church.googlePlaceId) {
            output += ` - Google Place ID: ${church.googlePlaceId}\n`;
          }
          output += ` - DB ID: ${church.id}\n\n`;
        }
      }
    }

    // Write to file
    const fs = await import('fs/promises');
    await fs.writeFile('church-websites.md', output);
    console.log('✓ Written to church-websites.md');
  } catch (error) {
    console.error('Error:', error);
    // `process.exit` here would terminate before `finally` could run.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
listChurchWebsites();

View File

@@ -0,0 +1,44 @@
import { Pool } from 'pg';
import * as dotenv from 'dotenv';
import * as path from 'path';
// Load .env.local first (takes precedence), then .env
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
// Connection pool for direct SQL access, configured from DATABASE_URL
// (loaded from .env.local / .env above).
const pool = new Pool({
connectionString: process.env.DATABASE_URL,
});
/**
 * Print the names of all tables in the `public` schema of the configured
 * database, followed by their count. Errors are logged; the pool is always
 * closed afterwards.
 */
async function listTables() {
  try {
    console.log('Connecting to database...');
    // Show the target database with the password masked.
    const maskedUrl = process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@');
    console.log('DATABASE_URL:', maskedUrl);

    // Ask the information schema for every public table, alphabetically.
    const { rows } = await pool.query(`
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public'
ORDER BY table_name;
`);

    console.log('\n=== Tables in Database ===');
    if (rows.length > 0) {
      for (const row of rows) {
        console.log(`- ${row.table_name}`);
      }
    } else {
      console.log('No tables found!');
    }
    console.log(`\nTotal: ${rows.length} tables`);
  } catch (error) {
    console.error('Error listing tables:', error);
  } finally {
    await pool.end();
  }
}
listTables();

View File

@@ -0,0 +1,167 @@
const { Client } = require("pg");
// Single pg client used for every report query below.
// NOTE(review): the connection string, including credentials, is hard-coded
// and uses port 5434 — confirm this is intentional for this script.
const client = new Client({
connectionString: "postgresql://postgres:postgres@192.168.0.145:5434/nearestmass"
});
// Report definitions: each entry has a display `name` (printed as a heading)
// and the `sql` text that is sent to the database unchanged. They read from
// the `churches`, `mass_schedules` and `background_jobs` tables.
const queries = [
{
name: "1. Overall church counts by country (top 20)",
sql: `SELECT country, COUNT(*) as total,
COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped,
COUNT(*) FILTER (WHERE has_website = true) as has_website_flag,
COUNT(*) FILTER (WHERE website_language IS NOT NULL) as has_language
FROM churches
GROUP BY country
ORDER BY total DESC
LIMIT 20`
},
{
name: "2. Total mass schedule counts",
sql: `SELECT COUNT(*) as total_schedules,
COUNT(DISTINCT church_id) as churches_with_schedules
FROM mass_schedules`
},
// NOTE(review): `total_scraped` below is a plain COUNT(*) of rows that have a
// language, not of scraped rows; the `scraped` column holds the scraped
// count — confirm the alias is intended.
{
name: "3. Scrape results by language",
sql: `SELECT website_language as language,
COUNT(*) as total_scraped,
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped
FROM churches
WHERE website_language IS NOT NULL
GROUP BY website_language
ORDER BY total_scraped DESC`
},
{
name: "4. Churches with websites but never scraped",
sql: `SELECT COUNT(*) as has_website_not_scraped
FROM churches
WHERE website IS NOT NULL AND last_scraped_at IS NULL`
},
{
name: "5. Overall pipeline funnel",
sql: `SELECT
COUNT(*) as total_churches,
COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as attempted_scrape,
COUNT(*) FILTER (WHERE website_language IS NOT NULL) as has_detected_language,
(SELECT COUNT(DISTINCT church_id) FROM mass_schedules) as has_schedules_saved,
(SELECT COUNT(*) FROM mass_schedules) as total_schedule_rows
FROM churches`
},
{
name: "6. Recent scrape activity (last 7 days) by language",
sql: `SELECT website_language as language,
COUNT(*) as scraped_last_7d
FROM churches
WHERE last_scraped_at > NOW() - INTERVAL '7 days'
GROUP BY website_language
ORDER BY scraped_last_7d DESC`
},
{
name: "7. Background job history (last 15 completed/failed jobs)",
sql: `SELECT type, language, status,
created_at::date as created,
completed_at::date as completed,
ROUND(CAST(EXTRACT(EPOCH FROM (completed_at - created_at))/3600 AS numeric), 2) as hours,
total_items, processed, succeeded, failed
FROM background_jobs
WHERE status IN ('completed', 'failed')
ORDER BY completed_at DESC
LIMIT 15`
},
{
name: "8. Mass schedule breakdown by day of week",
sql: `SELECT day_of_week,
CASE day_of_week
WHEN 0 THEN 'Sunday' WHEN 1 THEN 'Monday' WHEN 2 THEN 'Tuesday'
WHEN 3 THEN 'Wednesday' WHEN 4 THEN 'Thursday' WHEN 5 THEN 'Friday'
WHEN 6 THEN 'Saturday' ELSE 'Other'
END as day_name,
COUNT(*) as count
FROM mass_schedules
GROUP BY day_of_week
ORDER BY day_of_week`
},
{
name: "9. Churches with schedules by country (top 15)",
sql: `SELECT c.country,
COUNT(DISTINCT c.id) as total_churches,
COUNT(DISTINCT ms.church_id) as churches_with_schedules,
ROUND(100.0 * COUNT(DISTINCT ms.church_id) / NULLIF(COUNT(DISTINCT c.id), 0), 1) as coverage_pct,
COUNT(ms.id) as total_schedule_rows
FROM churches c
LEFT JOIN mass_schedules ms ON ms.church_id = c.id
GROUP BY c.country
ORDER BY total_churches DESC
LIMIT 15`
},
{
name: "10. Enrichment sources - how churches were found",
sql: `SELECT source, COUNT(*) as count
FROM churches
GROUP BY source
ORDER BY count DESC`
},
{
name: "11. Google Places enrichment impact",
sql: `SELECT
COUNT(*) FILTER (WHERE google_place_id IS NOT NULL) as has_google_place,
COUNT(*) FILTER (WHERE google_place_id IS NOT NULL AND website IS NOT NULL) as google_with_website,
COUNT(*) FILTER (WHERE google_place_id IS NULL) as no_google_place,
COUNT(*) FILTER (WHERE google_searched_at IS NOT NULL) as google_searched,
COUNT(*) FILTER (WHERE free_searched_at IS NOT NULL) as free_searched
FROM churches`
},
{
name: "12. Website presence by source",
sql: `SELECT source,
COUNT(*) as total,
COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
ROUND(100.0 * COUNT(*) FILTER (WHERE website IS NOT NULL) / NULLIF(COUNT(*), 0), 1) as website_pct,
COUNT(*) FILTER (WHERE google_place_id IS NOT NULL) as has_google_place,
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped
FROM churches
GROUP BY source
ORDER BY total DESC`
}
];
/**
 * Execute every entry of `queries` in order and print each result as a
 * fixed-width text table under a banner with the query's name. A query that
 * fails prints its error message and the run moves on to the next one.
 */
async function run() {
  await client.connect();

  const banner = "=".repeat(90);
  for (const { name, sql } of queries) {
    console.log(banner);
    console.log(name);
    console.log(banner);

    try {
      const { rows } = await client.query(sql);

      if (rows.length > 0) {
        // Column names come from the first row; NULLs are shown as "NULL".
        const headers = Object.keys(rows[0]);

        // Each column is as wide as its header or its longest value.
        const widths = headers.map(header =>
          rows.reduce(
            (widest, row) => Math.max(widest, String(row[header] ?? "NULL").length),
            header.length
          )
        );

        // Header line and separator
        console.log(headers.map((header, col) => header.padEnd(widths[col])).join(" | "));
        console.log(widths.map(width => "-".repeat(width)).join("-+-"));

        // Data lines
        rows.forEach(row => {
          const cells = headers.map((header, col) =>
            String(row[header] ?? "NULL").padEnd(widths[col])
          );
          console.log(cells.join(" | "));
        });
      } else {
        console.log("(no rows returned)");
      }

      console.log("\n(" + rows.length + " rows)\n");
    } catch (err) {
      console.log("ERROR:", err.message, "\n");
    }
  }

  await client.end();
}
run().catch(e => { console.error(e); process.exit(1); });

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env tsx
/**
* Show detailed output from a successful French parse
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrape one known-good French page and print the parsed schedules grouped by
 * weekday (French day names, Sunday first).
 *
 * The scraper is closed in a `finally` block so it is released even when the
 * scrape itself throws.
 */
async function showSuccess() {
  // One of our successful churches with 16 schedules
  const url = 'https://laportelatine.org/lieux/couvent-saint-francois-morgon';
  console.log(`Detailed parse of: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('FR');
    const result = await scraper.scrape(url);
    console.log(`✅ Success: ${result.success}`);
    console.log(`📅 Schedules found: ${result.schedules.length}\n`);

    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    // Indexed by `dayOfWeek`, with 0 = Sunday.
    const dayNames = ['Dimanche', 'Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi'];
    console.log('═══════════════════════════════════════════════');
    console.log('PARSED SCHEDULE:');
    console.log('═══════════════════════════════════════════════\n');

    // Object keys are strings, so convert back to numbers for ordering and lookup.
    Object.entries(byDay)
      .sort(([a], [b]) => parseInt(a) - parseInt(b))
      .forEach(([day, scheds]) => {
        console.log(`${dayNames[parseInt(day)]}:`);
        scheds.forEach(s => {
          console.log(` ${s.time} - ${s.language} ${s.massType}`);
        });
        console.log('');
      });
  } finally {
    await scraper.close();
  }
}
showSuccess().catch(console.error);

View File

@@ -0,0 +1,28 @@
#!/usr/bin/env tsx
/**
* Test database connection
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
// Report whether DATABASE_URL is configured. The password part of the URL is
// masked before printing; the first 50 characters of a connection string
// normally contain the credentials, so printing them raw leaks the secret.
// NOTE(review): static `import` statements are hoisted, so `../../src/lib/db`
// may be evaluated before the dotenv `config()` calls above — confirm.
console.log('DATABASE_URL exists:', !!process.env.DATABASE_URL);
console.log(
  'DATABASE_URL value:',
  process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@').substring(0, 50) + '...',
);
import { prisma } from '../../src/lib/db';
/**
 * Verify database connectivity by counting churches.
 *
 * Prints the count on success. On failure the error message is printed and
 * the exit status is set to 1, so shells and health checks can detect the
 * failure (previously the script always exited with 0). The Prisma client is
 * disconnected in either case.
 */
async function testConnection() {
  try {
    const count = await prisma.church.count();
    console.log(`✅ Database connection successful!`);
    console.log(`Total churches in database: ${count}`);
  } catch (err: unknown) {
    console.log(`❌ Database connection failed:`);
    console.log(err instanceof Error ? err.message : String(err));
    // Use exitCode rather than exit() so the finally block still runs.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
  }
}
testConnection();

View File

@@ -0,0 +1,180 @@
#!/usr/bin/env tsx
/**
* Test more French churches and collect diagnostic data
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Build a Prisma client that reaches Postgres through a `pg` connection pool
// via the PrismaPg adapter.
// NOTE(review): DATABASE_URL is not checked here; if it is unset the pool is
// created with an undefined connection string — confirm that is acceptable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/** Diagnostic record collected for one scraped church website. */
interface DiagnosticInfo {
// Website URL that was scraped.
url: string;
// Name of the church the URL belongs to.
churchName: string;
// `success` flag reported by the scraper.
success: boolean;
// Number of schedules the scraper returned.
schedulesFound: number;
// True when the stripped page text contains at least one French weekday name.
hasFrenchDays: boolean;
// True when the time regex matched anywhere in the stripped page text.
hasTimePatterns: boolean;
// Up to 10 distinct time-like strings matched on the page.
timePatternsSample: string[];
// First 500 characters of the stripped, lower-cased page text.
textSample: string;
// Error reported by the scraper, if any.
error?: string;
}
/**
 * Computes the content heuristics of a DiagnosticInfo from raw page HTML.
 *
 * Scripts, styles and tags are removed, whitespace is collapsed and the text
 * is lower-cased before it is searched for French weekday names and for
 * time-like patterns such as "10h30", "10:30" or "10.30".
 *
 * @param rawHtml - HTML returned by the scraper; may be missing.
 * @returns The heuristic fields; all "empty" when no HTML is available.
 */
function analyzePageText(
  rawHtml: string | null | undefined,
): Pick<DiagnosticInfo, 'hasFrenchDays' | 'hasTimePatterns' | 'timePatternsSample' | 'textSample'> {
  if (!rawHtml) {
    return { hasFrenchDays: false, hasTimePatterns: false, timePatternsSample: [], textSample: '' };
  }
  const text = rawHtml
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
  const frenchDays = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'];
  const times = text.match(/\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h)?/g);
  return {
    hasFrenchDays: frenchDays.some(day => text.includes(day)),
    hasTimePatterns: times !== null,
    timePatternsSample: times ? [...new Set(times)].slice(0, 10) : [],
    textSample: text.substring(0, 500),
  };
}

/**
 * Scrapes up to 20 French OSM churches that have a website and prints, for
 * every failure, whether the page contained weekday names and/or time
 * patterns, followed by an aggregated failure analysis.
 *
 * A church counts as successful only when the scraper reports success AND
 * returns at least one schedule; the same rule is used for the per-church
 * output and for the final analysis. The browser, the Prisma client and the
 * pg pool are released even when an unexpected error is thrown.
 */
async function testFrenchBroader(): Promise<void> {
  console.log('Testing 20 French churches with diagnostics...\n');
  try {
    // Get more French churches
    const churches = await prisma.church.findMany({
      where: {
        country: 'FR',
        website: { not: null },
        source: 'osm',
      },
      take: 20,
      orderBy: { createdAt: 'asc' },
    });
    if (churches.length === 0) {
      console.log('No French churches found.');
      return;
    }
    console.log(`Found ${churches.length} French churches to test\n`);

    const diagnostics: DiagnosticInfo[] = [];
    // Single definition of "success" shared by the loop and the analysis.
    const isParsed = (d: DiagnosticInfo): boolean => d.success && d.schedulesFound > 0;

    const scraper = new GenericScraper();
    await scraper.init();
    try {
      scraper.setCountry('FR');
      for (const [i, church] of churches.entries()) {
        // The query filters out null websites; the fallback only satisfies the type.
        const url = church.website ?? '';
        console.log(`[${i + 1}/${churches.length}] Testing: ${church.name} (${church.city || 'Unknown'})`);
        console.log(`URL: ${church.website}`);
        try {
          const result = await scraper.scrape(url);
          const diagnostic: DiagnosticInfo = {
            url,
            churchName: church.name,
            success: result.success,
            schedulesFound: result.schedules.length,
            ...analyzePageText(result.rawHtml),
            error: result.error,
          };
          diagnostics.push(diagnostic);
          if (isParsed(diagnostic)) {
            console.log(`✅ SUCCESS - ${result.schedules.length} schedules`);
          } else {
            console.log(`❌ FAILED - ${result.error ?? 'no schedules found'}`);
            const { hasFrenchDays, hasTimePatterns, timePatternsSample } = diagnostic;
            if (hasFrenchDays && !hasTimePatterns) {
              console.log(` 💡 Has French days but no times`);
            } else if (!hasFrenchDays && hasTimePatterns) {
              console.log(` 💡 Has times but no French days`);
            } else if (hasFrenchDays && hasTimePatterns) {
              console.log(` 💡 Has BOTH days and times - parsing issue!`);
              console.log(` Sample times: ${timePatternsSample.slice(0, 5).join(', ')}`);
            } else {
              console.log(` 💡 No mass schedule content found`);
            }
          }
          console.log('');
        } catch (err: unknown) {
          const message = err instanceof Error ? err.message : String(err);
          console.log(`❌ ERROR - ${message}\n`);
          diagnostics.push({
            url,
            churchName: church.name,
            success: false,
            schedulesFound: 0,
            hasFrenchDays: false,
            hasTimePatterns: false,
            timePatternsSample: [],
            textSample: '',
            error: message,
          });
        }
      }
    } finally {
      await scraper.close();
    }

    // Analysis
    const successCount = diagnostics.filter(isParsed).length;
    console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log(`\nRESULTS: ${successCount}/${churches.length} successful (${((successCount / churches.length) * 100).toFixed(0)}%)`);
    console.log('');
    // Includes scrapes that reported success but returned no schedules, which
    // the per-church output above also reports as failures.
    const failures = diagnostics.filter(d => !isParsed(d));
    const hasBoth = failures.filter(d => d.hasFrenchDays && d.hasTimePatterns);
    const hasDaysNoTimes = failures.filter(d => d.hasFrenchDays && !d.hasTimePatterns);
    const hasTimesNoDays = failures.filter(d => !d.hasFrenchDays && d.hasTimePatterns);
    const hasNeither = failures.filter(d => !d.hasFrenchDays && !d.hasTimePatterns);
    console.log('FAILURE ANALYSIS:');
    console.log(` Has days + times but failed: ${hasBoth.length} (PARSING BUG)`);
    console.log(` Has days but no times: ${hasDaysNoTimes.length}`);
    console.log(` Has times but no days: ${hasTimesNoDays.length}`);
    console.log(` Has neither: ${hasNeither.length} (no mass schedule on page)`);
    console.log('');
    if (hasBoth.length > 0) {
      console.log('⚠️ PARSING BUGS TO FIX (has both days and times but failed):');
      for (const d of hasBoth) {
        console.log(` ${d.churchName}`);
        console.log(` URL: ${d.url}`);
        console.log(` Sample times found: ${d.timePatternsSample.slice(0, 5).join(', ')}`);
        console.log(` Text sample: ${d.textSample.substring(0, 150)}...`);
        console.log('');
      }
    }
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
testFrenchBroader().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,100 @@
#!/usr/bin/env tsx
/**
* Test international scraper against French churches
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Fail fast with a clear message when the connection string is missing,
// instead of handing `undefined` to the pool.
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}
// Prisma client backed by a pg connection pool through the PrismaPg adapter.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Scrapes the five oldest French OSM churches that have a website and prints
 * the schedules found for each, grouped by weekday, followed by a
 * success/failure summary.
 *
 * A church counts as successful only when the scraper reports success and
 * returns at least one schedule. The browser, the Prisma client and the pg
 * pool are released even when an unexpected error is thrown.
 */
async function testFrenchScraper(): Promise<void> {
  console.log('Testing French church mass schedule scraping...\n');
  try {
    // Get French churches with websites
    const churches = await prisma.church.findMany({
      where: {
        country: 'FR',
        website: { not: null },
        source: 'osm',
      },
      take: 5,
      orderBy: { createdAt: 'asc' },
    });
    if (churches.length === 0) {
      console.log('No French churches with websites found.');
      return;
    }
    console.log(`Found ${churches.length} French churches to test:\n`);

    const dayNames = ['Dimanche', 'Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi'];
    let successCount = 0;
    let failCount = 0;

    const scraper = new GenericScraper();
    await scraper.init();
    try {
      scraper.setCountry('FR');
      for (const church of churches) {
        console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
        console.log(`Church: ${church.name}`);
        console.log(`City: ${church.city || 'Unknown'}`);
        console.log(`URL: ${church.website}`);
        console.log('');
        try {
          // The query filters out null websites; the fallback only satisfies the type.
          const result = await scraper.scrape(church.website ?? '');
          if (result.success && result.schedules.length > 0) {
            successCount++;
            console.log(`✅ SUCCESS - Found ${result.schedules.length} schedules\n`);
            // Group by day and show
            const byDay: Record<number, typeof result.schedules> = {};
            for (const sched of result.schedules) {
              (byDay[sched.dayOfWeek] ??= []).push(sched);
            }
            for (const [day, scheds] of Object.entries(byDay)) {
              console.log(` ${dayNames[Number(day)]}:`);
              for (const s of scheds) {
                console.log(` ${s.time} - ${s.language || 'Unknown'} (${s.massType || 'Mass'})`);
              }
            }
            console.log('');
          } else {
            failCount++;
            // `error` is unset when the scrape succeeded but found nothing.
            console.log(`❌ FAILED - ${result.error ?? 'no schedules found'}`);
            console.log('');
          }
        } catch (err: unknown) {
          failCount++;
          console.log(`❌ ERROR - ${err instanceof Error ? err.message : String(err)}`);
          console.log('');
        }
      }
    } finally {
      await scraper.close();
    }

    console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log(`\nRESULTS: ${successCount}/${churches.length} successful (${((successCount / churches.length) * 100).toFixed(0)}%)`);
    console.log(`Success: ${successCount}, Failed: ${failCount}\n`);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
testFrenchScraper().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,210 @@
#!/usr/bin/env tsx
/**
* Test scraper on a diverse sample of international churches
* to identify edge cases across different languages and formats
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/** One hand-picked church website used as a scraper test case. */
type TestChurch = {
  /** Display name printed in the report. */
  name: string;
  /** Page handed to the scraper. */
  url: string;
  /** Country code passed to `scraper.setCountry()`. */
  country: string;
  /** Language of the site, printed in the report. */
  language: string;
  /** Optional free-form expectation, e.g., "Sun-Sat" or "Sun, Wed, Sat". */
  expectedDays?: string;
  /** Optional remark printed under the church header. */
  notes?: string;
};
// Sample churches from different countries/languages.
// Each entry is scraped in order by main(); `country` is passed to
// scraper.setCountry() and `notes` is printed under the church header.
const testChurches: TestChurch[] = [
// FRENCH
{
name: 'Saint-Étienne du Mont, Paris',
url: 'https://www.saintetiennedumontparis.fr/',
country: 'FR',
language: 'French',
notes: 'French format with "du lundi au vendredi"',
},
{
name: 'Notre-Dame de la Garde, Marseille',
url: 'https://www.notredamedelagarde.fr/',
country: 'FR',
language: 'French',
notes: 'Major pilgrimage site',
},
// GERMAN
{
name: 'St. Peter, Munich',
url: 'https://www.alterpeter.de/',
country: 'DE',
language: 'German',
notes: 'German format with "bis" for ranges',
},
{
name: 'Kölner Dom, Cologne',
url: 'https://www.koelner-dom.de/',
country: 'DE',
language: 'German',
notes: 'Cathedral with Uhr time format',
},
// SPANISH
{
name: 'Sagrada Família, Barcelona',
url: 'https://sagradafamilia.org/',
country: 'ES',
language: 'Spanish',
notes: 'Major tourist site, may have complex schedule',
},
{
name: 'Parroquia San Miguel, Madrid',
url: 'https://www.parroquiasanmiguel.es/',
country: 'ES',
language: 'Spanish',
notes: 'Spanish format with "de lunes a viernes"',
},
// PORTUGUESE
{
name: 'Basílica da Estrela, Lisbon',
url: 'https://www.basilicadaestrela.com/',
country: 'PT',
language: 'Portuguese',
notes: 'Portuguese format',
},
// ITALIAN
{
name: 'Santa Maria Maggiore, Rome',
url: 'https://www.vatican.va/various/basiliche/sm_maggiore/index_it.htm',
country: 'IT',
language: 'Italian',
notes: 'Major basilica',
},
{
name: 'Duomo di Milano',
url: 'https://www.duomomilano.it/',
country: 'IT',
language: 'Italian',
notes: 'Cathedral with Italian format',
},
// DUTCH
{
name: 'Basiliek van de H. Nicolaas, Amsterdam',
url: 'https://www.nicolaas-parochie.nl/',
country: 'NL',
language: 'Dutch',
notes: 'Dutch format with "tot" for ranges',
},
// CZECH
{
name: 'Chrám sv. Víta, Prague',
url: 'https://www.katedralasvatehovita.cz/',
country: 'CZ',
language: 'Czech',
notes: 'Czech format',
},
// HUNGARIAN
{
name: 'Szent István Bazilika, Budapest',
url: 'https://www.bazilika.biz/',
country: 'HU',
language: 'Hungarian',
notes: 'Hungarian format',
},
// More complex cases
{
name: 'Cathédrale Notre-Dame, Strasbourg',
url: 'https://www.cathedrale-strasbourg.fr/',
country: 'FR',
language: 'French',
notes: 'Bilingual region (French/German)',
},
];
/** Categories tallied in the final summary. */
type ChurchTestOutcome = 'success' | 'failed' | 'noSchedules';

/**
 * Scrapes one test church and prints the schedules found, grouped by weekday.
 *
 * @param church - The test case to run.
 * @param scraper - An initialised scraper; its country is set from the church.
 * @returns `'success'` when at least one schedule was found, `'noSchedules'`
 *   when the scrape succeeded but found nothing, and `'failed'` when the
 *   scraper reported failure or threw. Errors are logged, never rethrown.
 */
async function testChurch(church: TestChurch, scraper: GenericScraper): Promise<ChurchTestOutcome> {
  console.log(`\n${'='.repeat(80)}`);
  console.log(`📍 ${church.name}`);
  console.log(` ${church.url}`);
  console.log(` Language: ${church.language} | Country: ${church.country}`);
  if (church.notes) console.log(` Notes: ${church.notes}`);
  console.log(`${'='.repeat(80)}`);
  try {
    scraper.setCountry(church.country);
    const result = await scraper.scrape(church.url);
    if (!result.success) {
      console.log(`❌ FAILED: ${result.error || 'Unknown error'}`);
      return 'failed';
    }
    if (result.schedules.length === 0) {
      console.log(`⚠️ SUCCESS but NO SCHEDULES found`);
      return 'noSchedules';
    }
    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      (byDay[sched.dayOfWeek] ??= []).push(sched);
    }
    const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
    console.log(`\n✅ Found ${result.schedules.length} schedules:\n`);
    for (let i = 0; i < 7; i++) {
      const scheds = byDay[i];
      if (!scheds) continue;
      const times = scheds.map(s => {
        let str = s.time;
        if (s.massType) str += ` (${s.massType})`;
        if (s.language && s.language !== 'English') str += ` [${s.language}]`;
        return str;
      }).join(', ');
      console.log(` ${dayNames[i]}: ${times}`);
    }
    return 'success';
  } catch (error) {
    console.log(`❌ ERROR: ${error instanceof Error ? error.message : String(error)}`);
    return 'failed';
  }
}

/**
 * Runs every entry of `testChurches` through a shared scraper, pausing two
 * seconds between requests, and prints a summary of the outcomes.
 */
async function main(): Promise<void> {
  const scraper = new GenericScraper();
  await scraper.init();
  const results: Record<ChurchTestOutcome, number> = {
    success: 0,
    failed: 0,
    noSchedules: 0,
  };
  try {
    console.log('🌍 INTERNATIONAL CHURCH SCRAPER TEST');
    console.log(`Testing ${testChurches.length} churches across ${new Set(testChurches.map(c => c.country)).size} countries`);
    for (const [index, church] of testChurches.entries()) {
      // Tally the outcome; without this the summary would always print zeros.
      const outcome = await testChurch(church, scraper);
      results[outcome]++;
      // Brief delay between requests to be respectful (not needed after the last one)
      if (index < testChurches.length - 1) {
        await new Promise(resolve => setTimeout(resolve, 2000));
      }
    }
  } finally {
    await scraper.close();
  }
  console.log(`\n${'='.repeat(80)}`);
  console.log('📊 SUMMARY');
  console.log(`${'='.repeat(80)}`);
  console.log(`Total tested: ${testChurches.length}`);
  console.log(`✅ Success with schedules: ${results.success}`);
  console.log(`⚠️ Success but no schedules: ${results.noSchedules}`);
  console.log(`❌ Failed: ${results.failed}`);
}
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,36 @@
/**
* Quick test script to verify the masstimes.org JSON API scraper works
* Usage: npx tsx scripts/test-masstimes-api.ts
*/
import { writeFileSync } from 'fs';
import { MassTimesScraper } from '../../src/lib/masstimes-scraper';
/**
 * Smoke test for the masstimes.org scraper: fetches the churches near a fixed
 * coordinate (Greenville, SC) and writes them to `scraped-churches.json`.
 *
 * On failure the error is logged and the exit code is set to 1. The scraper
 * is closed in every case.
 */
async function main(): Promise<void> {
  console.log('Testing MassTimes.org JSON API Scraper\n');
  const scraper = new MassTimesScraper();
  try {
    await scraper.init();
    console.log('Browser initialized\n');
    const lat = 34.852;
    const lng = -82.394;
    console.log(`Fetching churches near Greenville, SC (${lat}, ${lng})...\n`);
    const churches = await scraper.scrapeByLocation(lat, lng);
    const outPath = 'scraped-churches.json';
    writeFileSync(outPath, JSON.stringify(churches, null, 2));
    console.log(`\nSaved ${churches.length} churches to ${outPath}`);
  } catch (error) {
    console.error('TEST FAILED:', error);
    // process.exit() would terminate immediately and skip the `finally`
    // block, leaving the scraper open; exitCode lets cleanup run first.
    process.exitCode = 1;
  } finally {
    await scraper.close();
  }
}
// A rejection can still come from `scraper.close()` in the finally block.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env tsx
/**
* Test which sections are being created for Polish church
*/
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
// Exact text from the page
const text = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();
console.log('Text:');
console.log(text);
console.log('\n');

/** Escapes regex metacharacters so a day name can be embedded in a pattern. */
const escapeForRegex = (raw: string): string => raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

// Alternation of every known Polish day name, longest first so that longer
// names win over their prefixes.
const dayPatterns = buildDayPatterns(getDayNamesForCountry('PL'));
const allDayNamesPattern = Object.keys(dayPatterns)
  .sort((a, b) => b.length - a.length)
  .map(escapeForRegex)
  .join('|');

/**
 * Builds the "section" regex for one day: the day name, then lazily captured
 * text up to the next day name or the end of the input.
 */
const sectionRegexFor = (dayName: string): RegExp =>
  new RegExp(
    `(?:^|\\s|[,;:])${escapeForRegex(dayName)}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );

/** Finds "H MM" style times (hour and minutes separated by whitespace). */
const findSpacedTimes = (section: string): RegExpMatchArray | null =>
  section.match(/\b(\d{1,2})\s+(\d{2})(?!\d)/g);

console.log('=== Testing individual day matching ===\n');
// Test niedziela specifically
const niedzielaMatch = text.match(sectionRegexFor('niedziela'));
if (!niedzielaMatch) {
  console.log(`✗ niedziela NOT matched`);
  console.log('');
  // Try simpler regex
  const fallbackMatch = text.match(/niedziela[:\s]+(.{0,100})/i);
  if (fallbackMatch) {
    console.log(`Simple regex matched: "${fallbackMatch[1]}"`);
  }
} else {
  console.log(`✓ niedziela matched!`);
  console.log(` Full match: "${niedzielaMatch[0].substring(0, 100)}"`);
  console.log(` Captured text: "${niedzielaMatch[1].substring(0, 100)}"`);
  console.log('');
  // Test if times can be extracted from captured text
  const found = findSpacedTimes(niedzielaMatch[1]);
  console.log(` Times in captured text: ${found ? found.join(', ') : 'none'}`);
}

// Test poniedziałek
console.log('\n=== Testing poniedziałek ===\n');
const mondayMatch = text.match(sectionRegexFor('poniedziałek'));
if (!mondayMatch) {
  console.log(`✗ poniedziałek NOT matched`);
} else {
  console.log(`✓ poniedziałek matched!`);
  console.log(` Captured text: "${mondayMatch[1].substring(0, 100)}"`);
  const found = findSpacedTimes(mondayMatch[1]);
  console.log(` Times: ${found ? found.join(', ') : 'none'}`);
}

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env tsx
/**
* Test Polish church with detailed section logging
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Wrap GenericScraper's parseSchedules so that every call first logs the
// sections returned by findScheduleSections, then defers to the original.
// Both methods are reached through bracket access.
const unpatchedParseSchedules = GenericScraper.prototype['parseSchedules'];
GenericScraper.prototype['parseSchedules'] = function(html: string) {
  // Same normalisation as below: drop scripts/styles/tags, collapse whitespace.
  const withoutScripts = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
  const withoutStyles = withoutScripts.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, ' ');
  const plainText = withoutTags.replace(/\s+/g, ' ').toLowerCase();

  // Call findScheduleSections and log result
  const detected = this['findScheduleSections'](plainText);
  const englishDays = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
  console.log('\n=== Sections found by findScheduleSections() ===\n');
  detected.forEach((entry: any, position: number) => {
    const header = `Section ${position + 1}: ${englishDays[entry.day]} (day ${entry.day})`;
    const preview = ` Text: "${entry.text.substring(0, 80)}..."`;
    console.log(header);
    console.log(preview);
  });
  console.log(`\nTotal sections: ${detected.length}\n`);

  // Continue with normal processing
  return unpatchedParseSchedules.call(this, html);
};
/**
 * Scrapes one Polish parish site with the country set to 'PL' and prints the
 * parsed schedules grouped by weekday (Polish day names).
 *
 * The scraper is closed even when the scrape throws.
 */
async function testPolish(): Promise<void> {
  const url = 'http://parafialubojna.pl';
  console.log(`Testing: ${url}`);
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('PL');
    const result = await scraper.scrape(url);
    console.log(`\nFinal result: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);
    if (result.schedules.length > 0) {
      const byDay: Record<number, typeof result.schedules> = {};
      for (const sched of result.schedules) {
        (byDay[sched.dayOfWeek] ??= []).push(sched);
      }
      const dayNamesPL = ['Niedziela', 'Poniedziałek', 'Wtorek', 'Środa', 'Czwartek', 'Piątek', 'Sobota'];
      console.log('Parsed schedules by day:');
      for (let i = 0; i < 7; i++) {
        const scheds = byDay[i];
        if (scheds) {
          console.log(` ${dayNamesPL[i]}: ${scheds.map(s => s.time).join(', ')}`);
        }
      }
    }
  } finally {
    // Release the scraper even when scrape() rejects.
    await scraper.close();
  }
}
testPolish().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env tsx
/**
* Test which pattern is matching "00" time
*/
// Test text from German church
const testText = "10:00 uhr lateinisches amt";
// Shortened sample: just the minutes part followed by "uhr"
const testText2 = "00 uhr lateinisches";

// Candidate time regexes, tried one by one against each sample text.
const timePatterns = [
  { name: '12-hour AM/PM', pattern: /(\d{1,2}):(\d{2})\s*(AM|PM|am|pm|a\.m\.|p\.m\.)/g },
  { name: '12-hour no minutes', pattern: /(?<![:\d])(\d{1,2})\s*(AM|PM|am|pm|a\.m\.|p\.m\.)/g },
  { name: '24-hour colon', pattern: /(?<![:\d\w])(\d{1,2}):(\d{2})(?!\s*(AM|PM|am|pm))/g },
  { name: 'French/Portuguese h', pattern: /(?<![:\d\w])(\d{1,2})\s*h\s*(\d{2})?(?!\w)/gi },
  { name: 'Italian period', pattern: /(?<![:\d\w])(\d{1,2})\.(\d{2})(?=\s|$|,|;|\)|\])/g },
  { name: 'German Uhr (old)', pattern: /(\d{1,2})[:\.]?(\d{2})?\s*Uhr/gi },
  { name: 'German Uhr (fixed)', pattern: /(?<![:\d])(\d{1,2})[:\.]?(\d{2})?\s*Uhr/gi },
  { name: 'Polish space', pattern: /\b(\d{1,2})\s+(\d{2})(?!\d)/g },
];

/** Prints, for every pattern, each match found in `sample` or "no match". */
function reportMatches(sample: string): void {
  console.log(`Test text: "${sample}"\n`);
  timePatterns.forEach(({ name, pattern }) => {
    const hits = Array.from(sample.matchAll(pattern));
    if (hits.length === 0) {
      console.log(`${name}: no match`);
      return;
    }
    console.log(`${name}:`);
    hits.forEach(hit => {
      console.log(` Matched: "${hit[0]}" at index ${hit.index}`);
    });
  });
}

reportMatches(testText);
// Now test with just "00 uhr"
console.log(`\n${'='.repeat(60)}\n`);
reportMatches(testText2);

View File

@@ -0,0 +1,193 @@
#!/usr/bin/env tsx
/**
* Quick test of top 5 priority countries
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Fail fast with a clear message when the connection string is missing,
// instead of handing `undefined` to the pool.
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}
// Prisma client backed by a pg connection pool through the PrismaPg adapter.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// Countries exercised by this run. `code` is used as the `country` filter of
// the church query and passed to scraper.setCountry(); `name` is only printed.
const COUNTRIES = [
{ code: 'FR', name: 'France' },
{ code: 'DE', name: 'Germany' },
{ code: 'ES', name: 'Spain' },
{ code: 'PL', name: 'Poland' },
{ code: 'BR', name: 'Brazil' },
];
// Maximum number of churches fetched per country (the `take` of the query).
const PER_COUNTRY = 10;
/** Aggregated scrape statistics for one country. */
type CountryResult = {
  /** Country code used for the query and the scraper. */
  country: string;
  /** Human-readable country name used in the report. */
  countryName: string;
  /** Number of churches that were scraped. */
  tested: number;
  /** Churches for which at least one schedule was extracted. */
  success: number;
  /** Churches that failed or threw. */
  failed: number;
  /** `success / tested` as a percentage (0 when nothing was tested). */
  successRate: number;
  /** Has days + times but parsing failed */
  hasBothButFailed: number;
  /** Sum of the schedules extracted over all successful churches. */
  totalSchedules: number;
  /** Short description of the first successful church, if any. */
  sampleSuccess?: string;
};
/**
 * Heuristic for "the page had schedule-like content": true when the
 * tag-stripped, lower-cased page text contains both a weekday name (English,
 * French, German, Spanish, Polish or Portuguese) and a time-like pattern.
 *
 * @param rawHtml - HTML returned by the scraper.
 */
function hasDaysAndTimes(rawHtml: string): boolean {
  const text = rawHtml
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
  // Check for day names in any language. `\b` only knows ASCII word
  // characters, so it can never match in front of "środa"; Unicode letter
  // lookarounds delimit whole words correctly for accented names too.
  const hasDays = /(?<!\p{L})(?:sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|terça|terca|quarta|quinta|sexta)(?!\p{L})/u.test(text);
  const hasTimes = /\d{1,2}[h:\.]\s*\d{0,2}/.test(text);
  return hasDays && hasTimes;
}

/**
 * Scrapes up to PER_COUNTRY OSM churches of one country and prints a line per
 * church plus a per-country summary.
 *
 * @param scraper - An initialised scraper; its country is set here.
 * @param country - Country code and display name.
 * @returns The aggregated result, or undefined when the country has no
 *   churches with a website.
 */
async function testCountry(
  scraper: GenericScraper,
  country: { code: string; name: string },
): Promise<CountryResult | undefined> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`Testing ${country.name} (${country.code})`);
  console.log('='.repeat(60));
  const churches = await prisma.church.findMany({
    where: {
      country: country.code,
      website: { not: null },
      source: 'osm',
    },
    take: PER_COUNTRY,
    orderBy: { createdAt: 'asc' },
  });
  if (churches.length === 0) {
    console.log(`No churches with websites found for ${country.name}\n`);
    return undefined;
  }
  scraper.setCountry(country.code);
  let success = 0;
  let failed = 0;
  let hasBothButFailed = 0;
  let totalSchedules = 0;
  let sampleSuccess: string | undefined;
  for (const [i, church] of churches.entries()) {
    process.stdout.write(`[${i + 1}/${churches.length}] ${church.name.substring(0, 40).padEnd(40)} `);
    try {
      // The query filters out null websites; the fallback only satisfies the type.
      const result = await scraper.scrape(church.website ?? '');
      if (result.success && result.schedules.length > 0) {
        success++;
        totalSchedules += result.schedules.length;
        process.stdout.write(`✅ ${result.schedules.length} schedules\n`);
        if (!sampleSuccess) {
          sampleSuccess = `${church.name}: ${result.schedules.length} schedules`;
        }
      } else {
        failed++;
        // `error` is unset when the scrape succeeded but found nothing.
        process.stdout.write(`❌ ${result.error ?? 'no schedules found'}\n`);
        // Check if has both days and times (parsing bug indicator)
        if (result.rawHtml && hasDaysAndTimes(result.rawHtml)) {
          hasBothButFailed++;
          process.stdout.write(` ⚠️ Has days + times but failed to parse\n`);
        }
      }
    } catch (err: unknown) {
      failed++;
      process.stdout.write(`❌ ERROR: ${err instanceof Error ? err.message : String(err)}\n`);
    }
  }
  const successRate = (success / churches.length) * 100;
  console.log(`\n${country.name} Summary: ${success}/${churches.length} (${successRate.toFixed(0)}%)`);
  console.log(` Total schedules extracted: ${totalSchedules}`);
  if (hasBothButFailed > 0) {
    console.log(` ⚠️ Parsing bugs: ${hasBothButFailed} (has content but failed to parse)`);
  }
  return {
    country: country.code,
    countryName: country.name,
    tested: churches.length,
    success,
    failed,
    successRate,
    hasBothButFailed,
    totalSchedules,
    sampleSuccess,
  };
}

/** Prints the per-country table and the overall totals. */
function printFinalSummary(results: readonly CountryResult[]): void {
  console.log('\n\n');
  console.log('═'.repeat(80));
  console.log('FINAL RESULTS - TOP 5 COUNTRIES');
  console.log('═'.repeat(80));
  console.log('');
  // Header cells use the same widths as the data rows so columns line up.
  console.log(`${'Country'.padEnd(12)} | ${'Tested'.padStart(6)} | ${'Success'.padStart(7)} | ${'Rate'.padStart(5)} | ${'Schedules'.padStart(9)} | Bugs`);
  console.log('─'.repeat(80));
  const totalTested = results.reduce((sum, r) => sum + r.tested, 0);
  const totalSuccess = results.reduce((sum, r) => sum + r.success, 0);
  const totalSchedules = results.reduce((sum, r) => sum + r.totalSchedules, 0);
  const totalBugs = results.reduce((sum, r) => sum + r.hasBothButFailed, 0);
  for (const r of results) {
    const country = r.countryName.padEnd(12);
    const tested = String(r.tested).padStart(6);
    const success = String(r.success).padStart(7);
    const rate = `${r.successRate.toFixed(0)}%`.padStart(5);
    const schedules = String(r.totalSchedules).padStart(9);
    const bugs = r.hasBothButFailed > 0 ? `⚠️ ${r.hasBothButFailed}` : '✓';
    console.log(`${country} | ${tested} | ${success} | ${rate} | ${schedules} | ${bugs}`);
  }
  console.log('─'.repeat(80));
  const avgRate = totalTested > 0 ? (totalSuccess / totalTested) * 100 : 0;
  console.log(`${'OVERALL'.padEnd(12)} | ${String(totalTested).padStart(6)} | ${String(totalSuccess).padStart(7)} | ${avgRate.toFixed(0).padStart(4)}% | ${String(totalSchedules).padStart(9)} | ${totalBugs > 0 ? `⚠️ ${totalBugs}` : '✓'}`);
  console.log('');
  console.log('═'.repeat(80));
  console.log('');
  if (totalBugs > 0) {
    console.log(`⚠️ ${totalBugs} parsing bugs detected (has days + times but failed)`);
    console.log(' These need investigation and fixes.\n');
  } else {
    console.log('✅ No parsing bugs! All failures are legitimate (no content or wrong page).\n');
  }
  console.log(`Total churches tested: ${totalTested}`);
  console.log(`Total successful: ${totalSuccess} (${avgRate.toFixed(1)}%)`);
  console.log(`Total mass schedules extracted: ${totalSchedules}`);
  console.log('');
}

/**
 * Runs the scraper against a sample of churches from each entry of COUNTRIES
 * and prints per-country and overall statistics.
 *
 * The browser, the Prisma client and the pg pool are released even when an
 * unexpected error is thrown.
 */
async function testTop5(): Promise<void> {
  console.log('Testing top 5 priority countries (10 churches each)...\n');
  try {
    const results: CountryResult[] = [];
    const scraper = new GenericScraper();
    await scraper.init();
    try {
      for (const country of COUNTRIES) {
        const result = await testCountry(scraper, country);
        if (result) results.push(result);
      }
    } finally {
      await scraper.close();
    }
    // Final summary
    printFinalSummary(results);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
testTop5().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,173 @@
#!/usr/bin/env tsx
/**
* Test website scraper on churches with websites
* Analyzes which websites can be scraped successfully
*/
// Load .env
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
import fs from 'fs';
// Fail fast with a clear message when the connection string is missing,
// instead of handing `undefined` to the pool.
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}
// Prisma client backed by a pg connection pool through the PrismaPg adapter.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/** One row of the exported report: the scrape outcome for a single church. */
type TestResult = {
  /** Database id of the church. */
  churchId: string;
  /** Church name. */
  name: string;
  /** Normalised website URL that was scraped. */
  website: string;
  /** Country stored on the church record. */
  country: string;
  /** `success` flag reported by the scraper (false when the scrape threw). */
  success: boolean;
  /** Number of schedules returned by the scraper. */
  massesFound: number;
  /** The schedules themselves; absent when the scrape threw. */
  schedules?: { dayOfWeek: number; time: string; massType?: string; language?: string }[];
  /** Error reported by the scraper or thrown during the scrape, if any. */
  error?: string;
};
/**
 * Makes a stored website value usable as a navigation target.
 *
 * Surrounding whitespace is removed. A value that already has an http(s)
 * scheme (in any letter case) is returned as is; a protocol-relative value
 * ("//host/path") gets "https:" prepended; anything else gets "https://".
 *
 * @param url - Website value as stored on the church record.
 * @returns An absolute http(s) URL string.
 */
function normalizeUrl(url: string): string {
  const trimmed = url.trim();
  // Schemes are case-insensitive, so "HTTP://…" must not be prefixed again.
  if (/^https?:\/\//i.test(trimmed)) {
    return trimmed;
  }
  if (trimmed.startsWith('//')) {
    return `https:${trimmed}`;
  }
  return `https://${trimmed}`;
}
/**
 * Scrapes the most recently created churches that have a website, prints the
 * outcome per church and a summary, and writes all results to
 * `scraper-test-results.json`.
 *
 * @param limit - Maximum number of churches to test.
 * @param country - Optional country filter applied to the query.
 * @returns One TestResult per tested church.
 */
async function testScrapers(limit: number = 50, country?: string): Promise<TestResult[]> {
  const results: TestResult[] = [];
  // Get churches with websites
  const whereClause: { website: { not: null }; country?: string } = {
    website: { not: null },
  };
  if (country) {
    whereClause.country = country;
  }
  const churches = await prisma.church.findMany({
    where: whereClause,
    take: limit,
    orderBy: { createdAt: 'desc' },
  });
  console.log(`Testing ${churches.length} churches with websites...\n`);
  const days = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];
  // Initialize the scraper (launches Playwright browser)
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    for (const [i, church] of churches.entries()) {
      // The query filters out null websites; the fallback only satisfies the type.
      const url = normalizeUrl(church.website ?? '');
      console.log(`[${i + 1}/${churches.length}] Testing: ${church.name}`);
      console.log(` Website: ${url}`);
      try {
        const result = await scraper.scrape(url);
        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: result.success,
          massesFound: result.schedules.length,
          schedules: result.schedules.map((s) => ({
            dayOfWeek: s.dayOfWeek,
            time: s.time,
            massType: s.massType,
            language: s.language,
          })),
          error: result.error,
        });
        if (result.success) {
          console.log(` ✓ ${result.schedules.length} masses found`);
          for (const s of result.schedules) {
            console.log(` ${days[s.dayOfWeek]} ${s.time} (${s.language || 'English'}${s.massType ? ', ' + s.massType : ''})`);
          }
        } else {
          console.log(` ✗ No masses found: ${result.error}`);
        }
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        console.log(` ✗ Error: ${message}`);
        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: false,
          massesFound: 0,
          error: message,
        });
      }
      console.log('');
    }
  } finally {
    // Always close the browser
    await scraper.close();
  }
  // Summary
  const successful = results.filter((r) => r.success);
  const failed = results.filter((r) => !r.success);
  const totalMasses = results.reduce((sum, r) => sum + r.massesFound, 0);
  // Avoid printing "NaN%" when the query returned no churches.
  const percentOfTotal = (part: number): string =>
    results.length > 0 ? ((part / results.length) * 100).toFixed(1) : '0.0';
  console.log('============================================================');
  console.log('Test Summary');
  console.log('============================================================');
  console.log(`Total churches tested: ${results.length}`);
  console.log(`Successful scrapes: ${successful.length} (${percentOfTotal(successful.length)}%)`);
  console.log(`Failed scrapes: ${failed.length} (${percentOfTotal(failed.length)}%)`);
  console.log(`Total masses found: ${totalMasses}`);
  console.log('============================================================');
  if (failed.length > 0) {
    console.log('\nFailed websites:');
    for (const f of failed) {
      console.log(` - ${f.name}: ${f.website} (${f.error})`);
    }
  }
  console.log('');
  // Export results (without raw HTML to keep file manageable)
  fs.writeFileSync(
    'scraper-test-results.json',
    JSON.stringify(results, null, 2)
  );
  console.log('Results saved to scraper-test-results.json');
  return results;
}
/**
 * CLI entry point. Reads `--limit <n>` (default 50) and `--country <code>`
 * from the command line, runs `testScrapers`, and always releases the Prisma
 * client and the pg pool afterwards.
 *
 * @throws Error when `--limit` is given without a numeric value.
 */
async function main(): Promise<void> {
  try {
    const args = process.argv.slice(2);
    const limitIndex = args.indexOf('--limit');
    const countryIndex = args.indexOf('--country');
    const rawLimit = limitIndex !== -1 ? args[limitIndex + 1] : undefined;
    const limit = limitIndex !== -1 ? Number.parseInt(rawLimit ?? '', 10) : 50;
    // A missing or non-numeric value would otherwise reach the query as NaN.
    if (Number.isNaN(limit)) {
      throw new Error(`--limit expects an integer, got: ${rawLimit ?? '(missing)'}`);
    }
    const country = countryIndex !== -1 ? args[countryIndex + 1] : undefined;
    console.log('============================================================');
    console.log('Website Scraper Testing');
    console.log('============================================================');
    console.log(`Limit: ${limit}`);
    console.log(`Country: ${country || 'All'}`);
    console.log('============================================================\n');
    await testScrapers(limit, country);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env tsx
/**
* Verify Paróquia da Paz schedules are correctly parsed
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrapes the Paróquia da Paz website with the country set to 'BR' and prints
 * the parsed schedules by weekday next to the expected schedule, so the two
 * can be compared by eye.
 *
 * The scraper is closed even when the scrape throws.
 */
async function verifyPazSchedules(): Promise<void> {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Verifying: ${url}\n`);
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('BR');
    const result = await scraper.scrape(url);
    console.log(`✅ Success: ${result.success}`);
    console.log(`📅 Schedules found: ${result.schedules.length}\n`);
    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      (byDay[sched.dayOfWeek] ??= []).push(sched);
    }
    const dayNames = ['Domingo', 'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado'];
    console.log('═══════════════════════════════════════════════');
    console.log('PARSED SCHEDULE:');
    console.log('═══════════════════════════════════════════════\n');
    const ordered = Object.entries(byDay).sort(([a], [b]) => Number(a) - Number(b));
    for (const [day, scheds] of ordered) {
      console.log(`${dayNames[Number(day)]}:`);
      for (const s of scheds) {
        // Skip missing fields instead of printing the word "undefined".
        const details = [s.language, s.massType].filter(Boolean).join(' ');
        console.log(` ${s.time} - ${details}`);
      }
      console.log('');
    }
    console.log('Expected schedule (from website):');
    console.log('Segunda, Terça, Quarta, Sexta: 16:00 e 18:00');
    console.log('Quinta: 16:00 e 19:00');
    console.log('Sábado: 08:00, 16:00 e 18:00');
    console.log('Domingo: 08:00, 11:00, 16:00, 18:00 e 20:00');
  } finally {
    await scraper.close();
  }
}
verifyPazSchedules().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});

97
scripts/dedup-churches.ts Normal file
View File

@@ -0,0 +1,97 @@
/**
* Find duplicate churches using ChromaDB semantic similarity.
*
* Usage:
* npx tsx scripts/dedup-churches.ts # Dry run, show duplicates
* npx tsx scripts/dedup-churches.ts --threshold 0.15 # Custom similarity threshold
* npx tsx scripts/dedup-churches.ts --country US # Only check US churches
* npx tsx scripts/dedup-churches.ts --limit 100 # Check first 100 churches
*/
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { findSimilarChurches } from '../src/chromadb/queries';
// Fail fast with a clear message when the connection string is missing,
// instead of handing `undefined` to the pool.
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}
// Prisma client backed by a pg connection pool through the PrismaPg adapter.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Returns the token that follows `flag` in `argv`, or undefined when the flag
 * is absent or is the last token.
 */
function flagValue(argv: readonly string[], flag: string): string | undefined {
  const index = argv.indexOf(flag);
  return index === -1 ? undefined : argv[index + 1];
}

/**
 * Reads a numeric command-line flag.
 *
 * @param argv - Command-line tokens (without the node/script entries).
 * @param flag - Flag name, e.g. "--limit".
 * @param fallback - Value used when the flag is absent.
 * @param parse - Converts the raw token into a number.
 * @returns The parsed value, or `fallback` when the flag is not present.
 * @throws Error when the flag is present but its value is missing or not a
 *   finite number.
 */
function numericFlag(
  argv: readonly string[],
  flag: string,
  fallback: number,
  parse: (raw: string) => number,
): number {
  if (!argv.includes(flag)) return fallback;
  const raw = flagValue(argv, flag);
  const value = raw === undefined ? Number.NaN : parse(raw);
  // A NaN threshold would make every distance comparison false and silently
  // report "no duplicates"; reject it up front instead.
  if (!Number.isFinite(value)) {
    throw new Error(`Invalid value for ${flag}: ${raw ?? '(missing)'}`);
  }
  return value;
}

const args = process.argv.slice(2);
// Cosine distance threshold (lower = more similar)
const threshold = numericFlag(args, '--threshold', 0.15, Number.parseFloat);
const country = flagValue(args, '--country');
const limit = numericFlag(args, '--limit', 500, (raw) => Number.parseInt(raw, 10));
/**
 * Reports potential duplicate churches without modifying anything.
 *
 * Loads up to `limit` churches (optionally restricted to `country`), asks
 * findSimilarChurches() for the five nearest neighbours of each one, and prints
 * every neighbour whose distance is within `threshold`. Churches that were
 * already printed as a match of an earlier church are not checked again.
 */
async function main() {
  console.log(`Finding duplicate churches (threshold=${threshold}, country=${country || 'all'}, limit=${limit})`);
  console.log('---');

  const churches = await prisma.church.findMany({
    take: limit,
    where: country ? { country } : undefined,
    orderBy: { name: 'asc' },
    select: {
      id: true,
      name: true,
      address: true,
      city: true,
      country: true,
      source: true,
      latitude: true,
      longitude: true,
      _count: { select: { massSchedules: true } },
    },
  });
  console.log(`Checking ${churches.length} churches...\n`);

  // IDs already listed as a match of some earlier church.
  const alreadyReported = new Set<string>();
  let groupsFound = 0;

  for (const church of churches) {
    if (alreadyReported.has(church.id)) {
      continue;
    }

    const queryText = `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim();
    const neighbours = await findSimilarChurches(queryText, {
      country: church.country,
      nResults: 5,
    });

    // Keep neighbours within the threshold, dropping the church itself.
    const matches = neighbours
      .filter((candidate) => candidate.churchId !== church.id)
      .filter((candidate) => candidate.distance <= threshold);
    if (matches.length === 0) {
      continue;
    }

    groupsFound += 1;
    console.log(`\nPotential duplicate #${groupsFound}:`);
    console.log(` Original: "${church.name}" (${church.city || 'no city'}, ${church.country})`);
    console.log(` ID: ${church.id}, Source: ${church.source}, Schedules: ${church._count.massSchedules}`);
    console.log(` Lat/Lng: ${church.latitude}, ${church.longitude}`);
    matches.forEach((match) => {
      console.log(` Match: "${match.document}" (distance: ${match.distance.toFixed(4)})`);
      console.log(` ID: ${match.churchId}`);
      alreadyReported.add(match.churchId);
    });
  }

  console.log(`\n---`);
  console.log(`Found ${groupsFound} potential duplicate groups from ${churches.length} churches`);
  console.log(`Threshold: ${threshold} (lower = stricter matching)`);

  await prisma.$disconnect();
  await pool.end();
}

// Entry point: print the failure and exit non-zero.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env tsx
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Postgres connection pool configured from DATABASE_URL (this script loads it from .env.local above).
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
// Prisma client that talks to Postgres through the pg driver adapter (and therefore the pool above).
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Removes duplicate active mass schedules, church by church.
 *
 * Two active schedules of the same church count as duplicates when they share
 * dayOfWeek, time and language; the oldest row (by createdAt) is kept.
 * Runs as a dry run unless `--execute` is passed on the command line.
 */
async function main() {
  const dryRun = !process.argv.includes('--execute');
  if (dryRun) {
    console.log('DRY RUN - pass --execute to actually delete duplicates\n');
  }

  const churches = await prisma.church.findMany({
    where: { massSchedules: { some: { isActive: true } } },
    include: { massSchedules: { where: { isActive: true }, orderBy: { createdAt: 'asc' } } },
  });

  let churchesFixed = 0;
  let rowsDeleted = 0;

  for (const church of churches) {
    // Slots already seen for this church; schedules arrive oldest first.
    const slotsSeen = new Set<string>();
    const duplicateIds: string[] = [];

    for (const schedule of church.massSchedules) {
      const slot = `${schedule.dayOfWeek}:${schedule.time}:${schedule.language}`;
      if (slotsSeen.has(slot)) {
        duplicateIds.push(schedule.id);
      } else {
        slotsSeen.add(slot);
      }
    }

    if (duplicateIds.length === 0) {
      continue;
    }

    churchesFixed += 1;
    rowsDeleted += duplicateIds.length;
    if (!dryRun) {
      await prisma.massSchedule.deleteMany({
        where: { id: { in: duplicateIds } },
      });
    }
  }

  console.log(`Churches with duplicates: ${churchesFixed}`);
  console.log(`Duplicate rows ${dryRun ? 'found' : 'deleted'}: ${rowsDeleted}`);

  await prisma.$disconnect();
  await pool.end();
}

// Entry point: print the failure and exit non-zero.
main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});

27
scripts/deploy-to-nas.sh Executable file
View File

@@ -0,0 +1,27 @@
#!/bin/bash
# Deploy ScraperControl to the NAS: rsync the working tree (minus build output,
# VCS data, local env and logs), then rebuild and restart the Docker Compose
# services over SSH.
set -euo pipefail

NAS_HOST="albert@192.168.0.145"
NAS_PATH="/volume1/docker/scraper-control"
LOCAL_PATH="/Users/albert/Documents/Projects/Church/ScraperControl"

echo "Deploying ScraperControl to NAS..."
rsync -avz \
  --exclude 'node_modules' \
  --exclude '.next' \
  --exclude '.git' \
  --exclude '.env.local' \
  --exclude '*.log' \
  "$LOCAL_PATH/" \
  "$NAS_HOST:$NAS_PATH/"

echo "Rebuilding containers..."
# The heredoc delimiter is quoted, so nothing below is expanded locally; the
# remote directory is therefore spelled out and must be kept equal to NAS_PATH.
ssh "$NAS_HOST" << 'ENDSSH'
# Stop at the first failing command: without this a failed cd or build would
# still run "up -d", and ssh would only report the status of the last command.
set -e
cd /volume1/docker/scraper-control
/usr/local/bin/docker compose build app scraper scheduler
/usr/local/bin/docker compose up -d scheduler freesearch-enrichment
/usr/local/bin/docker compose ps
/usr/local/bin/docker compose logs --tail 5 scheduler
ENDSSH

echo "Deployment complete!"

View File

@@ -0,0 +1,408 @@
#!/usr/bin/env tsx
/**
* Enrich OSM churches with Google Places data (website, phone, email)
*
* Usage:
* npx tsx scripts/enrich-with-google-places.ts --limit 10 --dry-run
* npx tsx scripts/enrich-with-google-places.ts --country BR --limit 100
* npx tsx scripts/enrich-with-google-places.ts --all
*
* Rate Limiting:
* - Free tier: $200/month credit
* - Text Search: ~$17 per 1000 requests
* - $200 / $17 = ~11,764 requests per month
* - ~390 churches per day to stay within free tier
* - Script uses 2-second delay between requests (max 1,800/hour)
*/
// Load .env for database connection
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
// Use DATABASE_URL from .env (works for both local dev and NAS/production)
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import axios from 'axios';
// Postgres pool and Prisma client (via the pg driver adapter), both configured from DATABASE_URL.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// Places API key from the environment; main() and searchGooglePlaces() both refuse to run without it.
const GOOGLE_PLACES_API_KEY = process.env.GOOGLE_PLACES_API_KEY;
// Endpoint that searchGooglePlaces() POSTs its text query to.
const PLACES_API_URL = 'https://places.googleapis.com/v1/places:searchText';
// Pause used by enrichChurches() between Places lookups.
const RATE_LIMIT_MS = 2000; // 2 seconds between requests
// --- Job Tracking ---
/**
 * Resumes an existing background job when `--job-id <id>` is present in `args`.
 *
 * The referenced job row is switched to status `running` and its `startedAt`
 * is set to the current time.
 *
 * @param args Command-line arguments of the script.
 * @returns The resumed job id, or `null` when `--job-id` was not passed.
 * @throws Error when `--job-id` is passed without a value; errors from the
 *   Prisma update propagate unchanged.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) {
    return null;
  }

  const jobId = args[flagIndex + 1];
  // Without this guard a missing value reaches Prisma as `where: { id: undefined }`.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
/**
 * Inserts a `google-enrichment` background job that is already `running`.
 *
 * @param config Run options stored on the job row as-is.
 * @returns The id of the newly created job.
 */
async function createNewJob(config: Record<string, unknown>): Promise<string> {
  const { id } = await prisma.backgroundJob.create({
    data: {
      type: 'google-enrichment',
      status: 'running',
      startedAt: new Date(),
      config: config as any,
    },
  });
  return id;
}
/**
 * Writes the current progress counters onto the job row identified by `jobId`.
 * The five counters are stored under the column names matching the parameters.
 */
async function updateJobProgress(jobId: string, processed: number, succeeded: number, failed: number, itemsFound: number, totalItems: number): Promise<void> {
  const counters = { processed, succeeded, failed, itemsFound, totalItems };
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: counters,
  });
}
/**
 * Tells whether the job row currently has status `stopping`.
 *
 * @returns `true` only when the job exists and its status is `stopping`.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const row = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!row) {
    return false;
  }
  return row.status === 'stopping';
}
/**
 * Closes a job: status becomes `failed` when an error text is given and
 * `completed` otherwise; `completedAt` is set to the current time.
 *
 * @param error Optional failure description stored on the job row.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const finalStatus = error ? 'failed' : 'completed';
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: finalStatus, error, completedAt: new Date() },
  });
}
/**
 * Country priority order — largest OSM church counts first, since those
 * have the most un-enriched churches. Covers all countries from the
 * CATHOLIC_COUNTRIES lists in import-osm-churches.ts.
 *
 * enrichChurches() walks this list in order and issues one query per entry,
 * so every code must appear exactly once (a repeated code only costs a
 * redundant query).
 */
const COUNTRY_PRIORITY = [
  // Top tier: 5000+ OSM churches
  'FR', 'IT', 'ES', 'DE', 'PL', 'BR',
  // High tier: 1000-5000
  'PT', 'AT', 'BE', 'CZ', 'PH', 'HU', 'US', 'MX', 'HR', 'GB',
  'CR', 'SK', 'EC', 'CH', 'AR', 'CA', 'CO', 'NL', 'IE', 'IN',
  'SI', 'AU',
  // Medium tier: 100-1000
  'PE', 'RO', 'KR', 'CL', 'ID', 'LT', 'BO', 'VN', 'BA', 'BY',
  'UA', 'VE', 'HN', 'UG', 'CD', 'GT', 'CU', 'SV', 'NI', 'PA',
  'DO', 'CN', 'JP', 'LV', 'RS', 'TZ', 'KE', 'AL', 'RU',
  // Lower tier: remaining countries
  'LU', 'MT', 'NZ', 'PG', 'FJ', 'NC', 'PF', 'UY', 'PY', 'HT',
  'CM', 'RW', 'BI', 'MG', 'MW', 'ZM', 'ZW', 'MZ', 'AO', 'NG',
  'BJ', 'TG', 'CI', 'BF', 'ML', 'NE', 'SN', 'GN', 'LR', 'SL',
  'GH', 'GA', 'CG', 'CF', 'TD', 'SD', 'ET', 'ER', 'SO',
  'TL', 'MY', 'SG', 'TH', 'LA', 'KH', 'MM', 'LK', 'BD', 'PK',
  'LB', 'IL', 'PS', 'JO', 'SY', 'IQ',
  'GF', 'SR', 'GY', 'BS', 'BB', 'JM', 'TT', 'GD', 'LC', 'VC',
  'AG', 'DM', 'KN', 'MC', 'SM', 'VA', 'LI', 'AD',
  // 'RS' is already listed in the medium tier above.
  'MK', 'EE', 'GE', 'AM',
  'NA', 'BW', 'LS', 'SZ', 'MU', 'SC', 'KM', 'CV', 'ST', 'GQ',
  'DJ', 'GM', 'BT', 'NP', 'AF', 'KZ', 'UZ', 'TM', 'TJ', 'KG',
  'MN', 'BN', 'MV', 'WS', 'TO', 'VU', 'SB', 'KI', 'NR', 'TV',
  'FM', 'MH', 'PW',
];
/** Outcome of one Places text search, as returned by searchGooglePlaces(). */
interface GooglePlacesResult {
// True when the API returned at least one place; the optional fields are only filled in that case.
found: boolean;
// `websiteUri` of the first returned place, when it has one.
website?: string;
// `nationalPhoneNumber` of the first returned place, when it has one.
phone?: string;
// `id` of the first returned place.
placeId?: string;
}
/** Counters accumulated by enrichChurches() and printed by main(). */
interface EnrichmentStats {
// Churches taken from the queue, counted before the lookup is attempted.
processed: number;
// Churches updated with a website or phone; only counted when not in dry-run mode.
enriched: number;
// Lookups for which Google returned no place.
notFound: number;
// Lookups that threw, including the rate-limit stop.
errors: number;
// Results that carried a website (counted in dry-run mode as well).
websitesAdded: number;
// Results that carried a phone number (counted in dry-run mode as well).
phonesAdded: number;
}
/**
 * Looks a church up through the Google Places Text Search endpoint.
 *
 * The query text is "<name> <city>, <state>" (missing parts are dropped) and
 * the search is biased towards a 500 m circle around the church coordinates.
 * Only the first returned place is used.
 *
 * @returns `{ found: false }` when the API returns no place, otherwise the
 *   website, phone and place id of the first place (each may be undefined).
 * @throws Error('GOOGLE_PLACES_API_KEY not set in environment') without a key.
 * @throws Error('RATE_LIMITED') when the API answers HTTP 429.
 * @throws Any other request error unchanged, including a timeout.
 */
async function searchGooglePlaces(
  name: string,
  city: string | null,
  state: string | null,
  latitude: number,
  longitude: number
): Promise<GooglePlacesResult> {
  if (!GOOGLE_PLACES_API_KEY) {
    throw new Error('GOOGLE_PLACES_API_KEY not set in environment');
  }

  // Without a timeout a stalled connection would block the whole run forever.
  const requestTimeoutMs = 15000;

  // Build search query
  const location = [city, state].filter(Boolean).join(', ');
  const textQuery = `${name} ${location}`.trim();

  try {
    const response = await axios.post(
      PLACES_API_URL,
      {
        textQuery,
        locationBias: {
          circle: {
            center: { latitude, longitude },
            radius: 500, // 500 meters
          },
        },
      },
      {
        headers: {
          'Content-Type': 'application/json',
          'X-Goog-Api-Key': GOOGLE_PLACES_API_KEY,
          'X-Goog-FieldMask': 'places.id,places.displayName,places.websiteUri,places.nationalPhoneNumber',
        },
        timeout: requestTimeoutMs,
      }
    );

    const place = response.data.places?.[0]; // Take first result
    if (!place) {
      return { found: false };
    }
    return {
      found: true,
      website: place.websiteUri || undefined,
      phone: place.nationalPhoneNumber || undefined,
      placeId: place.id || undefined,
    };
  } catch (error: unknown) {
    // A 429 is translated into a sentinel error the caller uses to stop the run.
    if (axios.isAxiosError(error) && error.response?.status === 429) {
      console.error('Rate limited by Google Places API');
      throw new Error('RATE_LIMITED');
    }
    throw error;
  }
}
/**
 * Enriches OSM churches that were never searched on Google Places.
 *
 * Selection:
 * - with `countryCode`: up to `limit` churches of that country, oldest first;
 * - otherwise: countries are walked in COUNTRY_PRIORITY order until
 *   `limit` (default 390) churches are queued.
 *
 * For every queued church one Places lookup is made. Unless `dryRun` is set,
 * the church row is updated (website, phone, place id, hasWebsite) and
 * stamped with `googleSearchedAt`, also when nothing was found. A church whose
 * lookup threw is left unstamped and is therefore picked up again by a later run.
 *
 * The loop stops early on a rate-limit error or when the tracked job is put
 * into `stopping` state (checked every 10 churches).
 *
 * @param jobId Optional background job to report progress to.
 * @returns Counters for the run. `enriched` is only incremented on real
 *   (non dry-run) updates.
 */
async function enrichChurches(
  countryCode?: string,
  limit?: number,
  dryRun: boolean = false,
  jobId?: string | null
): Promise<EnrichmentStats> {
  const stats: EnrichmentStats = {
    processed: 0,
    enriched: 0,
    notFound: 0,
    errors: 0,
    websitesAdded: 0,
    phonesAdded: 0,
  };

  let churches;
  if (countryCode) {
    // Manual override: process specific country
    console.log(`Manual mode: Processing country ${countryCode}`);
    churches = await prisma.church.findMany({
      where: {
        source: 'osm',
        googleSearchedAt: null,
        country: countryCode,
      },
      take: limit,
      orderBy: { createdAt: 'asc' },
    });
  } else {
    // Priority mode: sequential through countries (exhaust each before moving on)
    console.log('Priority mode: Processing countries sequentially');
    console.log(`Top priority countries: ${COUNTRY_PRIORITY.slice(0, 10).join(', ')}...\n`);
    churches = [];
    const targetTotal = limit || 390;
    for (const country of COUNTRY_PRIORITY) {
      if (churches.length >= targetTotal) break;
      const remaining = targetTotal - churches.length;
      const batch = await prisma.church.findMany({
        where: {
          source: 'osm',
          googleSearchedAt: null,
          country,
        },
        take: remaining,
        orderBy: { createdAt: 'asc' },
      });
      if (batch.length > 0) {
        churches.push(...batch);
        console.log(` Queued ${batch.length} churches from ${country}`);
      }
    }
  }

  console.log(`\nFound ${churches.length} churches to enrich`);
  console.log('');

  for (const church of churches) {
    stats.processed++;
    try {
      console.log(`[${stats.processed}/${churches.length}] ${church.name} (${church.city}, ${church.state})`);
      const result = await searchGooglePlaces(
        church.name,
        church.city,
        church.state,
        church.latitude,
        church.longitude
      );

      if (result.found) {
        console.log(' ✓ Found on Google Places');
        if (result.website) {
          console.log(` Website: ${result.website}`);
          stats.websitesAdded++;
        }
        if (result.phone) {
          console.log(` Phone: ${result.phone}`);
          stats.phonesAdded++;
        }
        if (!dryRun) {
          // Existing values are kept when Google has nothing better to offer.
          await prisma.church.update({
            where: { id: church.id },
            data: {
              website: result.website || church.website,
              phone: result.phone || church.phone,
              googlePlaceId: result.placeId || church.googlePlaceId,
              hasWebsite: !!(result.website || church.website),
              googleSearchedAt: new Date(),
            },
          });
          if (result.website || result.phone) {
            stats.enriched++;
          }
        }
      } else {
        console.log(' ✗ Not found on Google Places');
        stats.notFound++;
        // Mark as attempted so we don't re-query this church
        if (!dryRun) {
          await prisma.church.update({
            where: { id: church.id },
            data: { googleSearchedAt: new Date() },
          });
        }
      }
    } catch (error: unknown) {
      stats.errors++;
      const message = error instanceof Error ? error.message : String(error);
      if (message === 'RATE_LIMITED') {
        console.error(' ⚠ Rate limited, stopping enrichment');
        break;
      }
      console.error(` ✗ Error: ${message}`);
    }

    // Rate limiting. Done outside the try block so a failed lookup is followed
    // by the same pause as a successful one instead of an immediate next request.
    await new Promise((resolve) => setTimeout(resolve, RATE_LIMIT_MS));

    // Job tracking: update progress every 10 items and check for stop
    if (jobId && stats.processed % 10 === 0) {
      await updateJobProgress(jobId, stats.processed, stats.enriched, stats.errors, stats.enriched, churches.length);
      const stopping = await checkJobStopping(jobId);
      if (stopping) {
        console.log('\nJob stop requested via admin dashboard.');
        break;
      }
    }

    // Progress update every 50 churches
    if (stats.processed % 50 === 0) {
      console.log('');
      console.log(`Progress: ${stats.processed}/${churches.length} processed`);
      console.log(` Enriched: ${stats.enriched}, Not found: ${stats.notFound}, Errors: ${stats.errors}`);
      console.log('');
    }
  }

  // Final job update
  if (jobId) {
    await updateJobProgress(jobId, stats.processed, stats.enriched, stats.errors, stats.enriched, churches.length);
  }
  return stats;
}
/**
 * CLI entry point for the Google Places enrichment.
 *
 * Flags: `--country <code>`, `--limit <n>` (default 10), `--all` (no limit),
 * `--dry-run`, `--job-id <id>` (resume a tracked job).
 *
 * A background job row is created for real runs (or resumed with `--job-id`).
 * It is marked `completed` after a successful run and `failed`, with the error
 * message, when the run throws. Database connections are closed in both cases.
 */
async function main() {
  const args = process.argv.slice(2);
  const countryIndex = args.indexOf('--country');
  const limitIndex = args.indexOf('--limit');
  const dryRun = args.includes('--dry-run');
  const all = args.includes('--all');
  const countryCode = countryIndex !== -1 ? args[countryIndex + 1] : undefined;

  let limit: number | undefined = 10;
  if (all) {
    limit = undefined;
  } else if (limitIndex !== -1) {
    limit = Number.parseInt(args[limitIndex + 1], 10);
    // A missing or non-numeric value would otherwise travel on as NaN.
    if (!Number.isInteger(limit) || limit <= 0) {
      console.error('Error: --limit requires a positive integer');
      process.exit(1);
    }
  }

  if (!GOOGLE_PLACES_API_KEY) {
    console.error('Error: GOOGLE_PLACES_API_KEY not set in environment');
    console.error('Add it to your .env file');
    process.exit(1);
  }

  console.log('============================================================');
  console.log('Google Places Church Enrichment');
  console.log('============================================================');
  console.log(`Country: ${countryCode || 'All'}`);
  console.log(`Limit: ${limit || 'No limit'}`);
  console.log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  console.log('============================================================');
  console.log('');

  let jobId: string | null = null;
  try {
    // Job tracking
    jobId = await createOrResumeJob(args);
    if (!jobId && !dryRun) {
      jobId = await createNewJob({ countryCode, limit, dryRun });
    }
    if (jobId) console.log(`Job ID: ${jobId}\n`);

    const stats = await enrichChurches(countryCode, limit, dryRun, jobId);

    console.log('');
    console.log('============================================================');
    console.log('Enrichment Summary');
    console.log('============================================================');
    console.log(`Churches processed: ${stats.processed}`);
    console.log(`Churches enriched: ${stats.enriched}`);
    console.log(`Not found on Google: ${stats.notFound}`);
    console.log(`Websites added: ${stats.websitesAdded}`);
    console.log(`Phone numbers added: ${stats.phonesAdded}`);
    console.log(`Errors encountered: ${stats.errors}`);
    console.log('============================================================');

    // Complete job
    if (jobId) {
      await completeJob(jobId);
    }
  } catch (error: unknown) {
    // Do not leave the job row in `running` state when the run dies.
    if (jobId) {
      const message = error instanceof Error ? error.message : String(error);
      try {
        await completeJob(jobId, message);
      } catch (markError) {
        // Reporting the failure is best effort; the original error is what matters.
        console.error('Could not mark job as failed:', markError);
      }
    }
    throw error;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

// Entry point: print the failure and exit non-zero.
main().catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});

View File

@@ -0,0 +1,624 @@
#!/usr/bin/env tsx
/**
* Enrich churches with city/state/zip via Nominatim reverse geocoding (OSM)
*
* Usage:
* npx tsx scripts/enrich-with-reverse-geocode.ts --country FR --limit 10 --dry-run
* npx tsx scripts/enrich-with-reverse-geocode.ts --country FR --continuous
* npx tsx scripts/enrich-with-reverse-geocode.ts --continuous
*
* Rate limit: 1 request/second (Nominatim usage policy — mandatory).
* Full pass of ~193K churches in ~2 days.
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import axios from 'axios';
// Fresh DB connection (not cached singleton)
// Postgres pool and Prisma client (via the pg driver adapter), both configured from DATABASE_URL.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// Reverse-geocoding endpoint used by reverseGeocode() and by the circuit breaker's probe request.
const NOMINATIM_URL = 'https://nominatim.openstreetmap.org/reverse';
// Pause between two churches in the processing loops.
const RATE_LIMIT_MS = 1100; // Slightly over 1s to stay safe
// Maximum number of churches fetched per getNextBatch() call.
const BATCH_SIZE = 50;
// Job progress is written, and the stop flag checked, every this many processed churches.
const PROGRESS_INTERVAL = 10;
// --- Job Tracking ---
/**
 * Resumes an existing background job when `--job-id <id>` is present in `args`.
 *
 * The referenced job row is switched to status `running` and its `startedAt`
 * is set to the current time.
 *
 * @param args Command-line arguments of the script.
 * @returns The resumed job id, or `null` when `--job-id` was not passed.
 * @throws Error when `--job-id` is passed without a value; errors from the
 *   Prisma update propagate unchanged.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) {
    return null;
  }

  const jobId = args[flagIndex + 1];
  // Without this guard a missing value reaches Prisma as `where: { id: undefined }`.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
/**
 * Inserts a `reverse-geocode-enrichment` background job that is already `running`.
 *
 * @param config Run options stored on the job row as-is.
 * @returns The id of the newly created job.
 */
async function createNewJob(config: Record<string, unknown>): Promise<string> {
  const { id } = await prisma.backgroundJob.create({
    data: {
      type: 'reverse-geocode-enrichment',
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });
  return id;
}
/**
 * Copies the run counters onto the job row: `processed` as is, `enriched` as
 * both `succeeded` and `itemsFound`, `errors` as `failed`, plus `totalItems`.
 */
async function updateJobProgress(jobId: string, stats: EnrichmentStats, totalItems: number): Promise<void> {
  const { processed, enriched, errors } = stats;
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: {
      processed,
      succeeded: enriched,
      failed: errors,
      itemsFound: enriched,
      totalItems,
    },
  });
}
/**
 * Tells whether the job row currently has status `stopping`.
 *
 * @returns `true` only when the job exists and its status is `stopping`.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const row = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!row) {
    return false;
  }
  return row.status === 'stopping';
}
/**
 * Closes a job: status becomes `failed` when an error text is given and
 * `completed` otherwise; `completedAt` is set to the current time.
 *
 * @param error Optional failure description stored on the job row.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const finalStatus = error ? 'failed' : 'completed';
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: finalStatus, error, completedAt: new Date() },
  });
}
// --- Types ---
/** The church columns selected by getNextBatch() and consumed by processChurch(). */
interface ChurchRecord {
id: string;
name: string;
// Address parts; processChurch() only fills the ones that are still empty.
address: string | null;
city: string | null;
state: string | null;
zip: string | null;
// Country code; the value 'XX' is treated as "unknown" and may be replaced by Nominatim's answer.
country: string;
// Coordinates sent to Nominatim.
latitude: number;
longitude: number;
}
/** Address parts of a Nominatim reverse-geocoding answer that this script knows about. */
interface NominatimAddress {
// Street-level parts, combined by extractAddress().
house_number?: string;
road?: string;
// Place-name candidates, tried in this order by extractCity().
city?: string;
town?: string;
village?: string;
municipality?: string;
hamlet?: string;
// Declared but not read by the code visible in this file.
suburb?: string;
neighbourhood?: string;
// Region candidates, tried in this order by extractState().
state?: string;
province?: string;
// Stored as the church's zip.
postcode?: string;
// Upper-cased and used to replace an unknown ('XX') church country.
country_code?: string;
}
/** Body of a Nominatim reverse-geocoding answer, as returned by reverseGeocode(). */
interface NominatimResponse {
display_name?: string;
// Structured address; processChurch() treats a missing address as "no address data".
address?: NominatimAddress;
// When set, processChurch() treats the answer as "no address data".
error?: string;
}
/** Counters shared by the processing loops, job progress updates and the final summary. */
interface EnrichmentStats {
// Churches taken from a batch, counted before the lookup is attempted.
processed: number;
// Lookups that produced a city.
enriched: number;
// Lookups that returned no address data or no city.
noCity: number;
// Lookups that threw (counted in processChurch's catch block).
errors: number;
// Initialised to 0 by main(); not updated by the code visible in this file.
skippedExisting: number;
// Number of cycles started in continuous mode.
cycles: number;
// Date.now() at the start of the run; used for rate and elapsed-time output.
startTime: number;
}
// --- Circuit Breaker ---
/**
 * Pauses processing after repeated lookup failures.
 *
 * The breaker opens after five failures in a row. While open, checkAndWait()
 * sleeps for the current back-off, then probes Nominatim with a fixed test
 * request; a 200 answer closes the breaker, anything else doubles the
 * back-off (60 s up to 5 min).
 */
class CircuitBreaker {
  private failures = 0;
  private isOpen = false;
  private backoffMs = 60000; // Start at 60s for Nominatim
  private readonly maxBackoffMs = 300000; // 5 minutes
  private readonly threshold = 5;

  /**
   * @returns `true` when requests may proceed (breaker closed, or just closed
   *   by a successful probe), `false` when the service still looks down.
   */
  async checkAndWait(): Promise<boolean> {
    if (!this.isOpen) {
      return true;
    }

    log(`Circuit breaker open. Waiting ${Math.round(this.backoffMs / 1000)}s before retry...`);
    await sleep(this.backoffMs);

    const serviceIsBack = await this.probe();
    if (serviceIsBack) {
      this.reset();
      log('Circuit breaker closed: Nominatim is back');
      return true;
    }

    this.backoffMs = Math.min(this.backoffMs * 2, this.maxBackoffMs);
    return false;
  }

  /** Counts one failure and opens the breaker once the threshold is reached. */
  recordFailure() {
    this.failures += 1;
    const mustOpen = this.failures >= this.threshold && !this.isOpen;
    if (!mustOpen) {
      return;
    }
    this.isOpen = true;
    this.backoffMs = 60000;
    log(`Circuit breaker OPEN after ${this.failures} consecutive failures`);
  }

  /** Clears the failure count, closes the breaker and restores the initial back-off. */
  reset() {
    if (this.failures === 0 && !this.isOpen) {
      return;
    }
    this.failures = 0;
    this.isOpen = false;
    this.backoffMs = 60000;
  }

  get opened() { return this.isOpen; }

  /** Sends the test request; any thrown error counts as "still down". */
  private async probe(): Promise<boolean> {
    try {
      const resp = await axios.get(NOMINATIM_URL, {
        params: { lat: 48.8566, lon: 2.3522, format: 'json' },
        headers: { 'User-Agent': 'NearestMass/1.0 (privacy@nearestmass.com)' },
        timeout: 10000,
      });
      return resp.status === 200;
    } catch {
      // Still down
      return false;
    }
  }
}
// --- Helpers ---
// Set to true by the SIGINT/SIGTERM handlers and by a dashboard stop request;
// polled by sleep() and by the processing loops so they can wind down.
let shuttingDown = false;
/** Prints `msg` to stdout, prefixed with the current ISO timestamp in brackets. */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
/** Prints `msg` to stderr, prefixed with the current ISO timestamp in brackets. */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error('[' + stamp + '] ' + msg);
}
/**
 * Waits for `ms` milliseconds, or less when a shutdown is requested meanwhile.
 *
 * The shutdown flag is polled once per second. Both timers are cleared as
 * soon as the promise resolves, so a finished sleep never keeps the process
 * alive.
 *
 * @param ms Maximum time to wait, in milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise(resolve => {
    // Single exit path: stop both timers, then resolve.
    const finish = (): void => {
      clearTimeout(timer);
      clearInterval(poll);
      resolve();
    };
    const timer = setTimeout(finish, ms);
    const poll = setInterval(() => {
      if (shuttingDown) {
        finish();
      }
    }, 1000);
  });
}
// --- Nominatim API ---
/**
 * Asks Nominatim for the address at the given coordinates.
 *
 * The request uses JSON format, zoom 16, address details, English labels,
 * the project's User-Agent and a 15 s timeout.
 *
 * @returns The response body as received.
 */
async function reverseGeocode(lat: number, lng: number): Promise<NominatimResponse> {
  const params = {
    lat,
    lon: lng,
    format: 'json',
    zoom: 16,
    addressdetails: 1,
  };
  const headers = {
    'User-Agent': 'NearestMass/1.0 (privacy@nearestmass.com)',
    'Accept-Language': 'en',
  };
  const { data } = await axios.get(NOMINATIM_URL, { params, headers, timeout: 15000 });
  return data;
}
/**
 * Picks the place name of an address: the first non-empty value among
 * city, town, village, municipality and hamlet, or `null` when none is set.
 */
function extractCity(address: NominatimAddress): string | null {
  const candidates = [
    address.city,
    address.town,
    address.village,
    address.municipality,
    address.hamlet,
  ];
  return candidates.find(Boolean) || null;
}
/** Picks the region of an address: `state` when set, else `province`, else `null`. */
function extractState(address: NominatimAddress): string | null {
  const region = address.state ? address.state : address.province;
  return region ? region : null;
}
/**
 * Builds a street line as "<house_number> <road>" from whichever of the two
 * parts is present; returns `null` when both are missing.
 */
function extractAddress(address: NominatimAddress): string | null {
  const street = [address.house_number, address.road].filter(Boolean).join(' ');
  return street === '' ? null : street;
}
// --- Database Queries ---
/**
 * Fetches the next churches to reverse geocode: no city yet and never
 * attempted (`reverseGeocodedAt` is null), optionally limited to one country.
 * Results are ordered by country, then by creation date.
 *
 * @param batchSize Maximum number of rows to return.
 * @param countryCode Optional country filter.
 */
async function getNextBatch(
  batchSize: number,
  countryCode?: string,
): Promise<ChurchRecord[]> {
  const countryFilter = countryCode ? { country: countryCode } : {};
  const batch = await prisma.church.findMany({
    where: {
      city: null,
      // NOTE(review): `not: undefined` looks like a no-op filter in Prisma — confirm.
      latitude: { not: undefined },
      longitude: { not: undefined },
      reverseGeocodedAt: null,
      ...countryFilter,
    },
    select: {
      id: true, name: true, address: true, city: true, state: true, zip: true,
      country: true, latitude: true, longitude: true,
    },
    take: batchSize,
    orderBy: [
      { country: 'asc' },
      { createdAt: 'asc' },
    ],
  });
  return batch;
}
/**
 * Counts the churches that still qualify for reverse geocoding, using the
 * same filter as getNextBatch().
 *
 * @param countryCode Optional country filter.
 */
async function getTotalRemaining(countryCode?: string): Promise<number> {
  const countryFilter = countryCode ? { country: countryCode } : {};
  const remaining = await prisma.church.count({
    where: {
      city: null,
      // NOTE(review): `not: undefined` looks like a no-op filter in Prisma — confirm.
      latitude: { not: undefined },
      longitude: { not: undefined },
      reverseGeocodedAt: null,
      ...countryFilter,
    },
  });
  return remaining;
}
// --- Main Processing ---
/**
 * Reverse geocodes one church and stores the result.
 *
 * On a usable answer the empty address fields of the church are filled
 * (existing values are never overwritten), an unknown country ('XX') is
 * replaced, and the church is stamped with `reverseGeocodedAt`. Answers
 * without address data only stamp the church. Nothing is written in dry-run
 * mode.
 *
 * Error handling:
 * - HTTP 429 and 5xx are retriable: the error is rethrown, nothing is stamped.
 * - Failures without any HTTP response (timeout, DNS or connection trouble,
 *   local errors) are also rethrown, so an outage cannot mark churches as
 *   attempted and the caller's circuit breaker sees the failure.
 * - Any other HTTP error (for example 4xx) stamps the church as attempted and
 *   returns normally.
 *
 * @param stats Run counters, updated in place.
 * @throws The original error for the retriable cases above.
 */
async function processChurch(
  church: ChurchRecord,
  stats: EnrichmentStats,
  dryRun: boolean,
): Promise<void> {
  const label = `${church.name} (${church.country})`;
  try {
    const result = await reverseGeocode(church.latitude, church.longitude);
    if (result.error || !result.address) {
      log(` - [${stats.processed}] ${label} => no address data`);
      stats.noCity++;
      if (!dryRun) {
        await prisma.church.update({
          where: { id: church.id },
          data: { reverseGeocodedAt: new Date() },
        });
      }
      return;
    }

    const address = extractAddress(result.address);
    const city = extractCity(result.address);
    const state = extractState(result.address);
    const zip = result.address.postcode || null;

    if (city) {
      const addrStr = address ? `${address}, ` : '';
      log(` + [${stats.processed}] ${label} => ${addrStr}${city}, ${state || '?'}`);
      stats.enriched++;
    } else {
      log(` - [${stats.processed}] ${label} => no city in response`);
      stats.noCity++;
    }

    if (!dryRun) {
      const updateData: Record<string, unknown> = {
        reverseGeocodedAt: new Date(),
      };
      // Only update fields that are currently null
      if (address && !church.address) updateData.address = address;
      if (city && !church.city) updateData.city = city;
      if (state && !church.state) updateData.state = state;
      if (zip && !church.zip) updateData.zip = zip;
      // Update country if currently unknown (XX) and Nominatim returned one
      const countryCodeResult = result.address.country_code?.toUpperCase();
      if (church.country === 'XX' && countryCodeResult && countryCodeResult !== 'XX') {
        updateData.country = countryCodeResult;
      }
      await prisma.church.update({
        where: { id: church.id },
        data: updateData,
      });
    }
  } catch (error: unknown) {
    stats.errors++;
    const status = axios.isAxiosError(error) ? error.response?.status : undefined;
    const message = error instanceof Error ? error.message : String(error);

    // Handle rate limiting (429)
    if (status === 429) {
      logError(` ! [${stats.processed}] ${label} => rate limited (429), backing off...`);
      await sleep(5000); // Extra 5s backoff
      throw error;
    }
    // Handle server errors (5xx)
    if (status !== undefined && status >= 500) {
      logError(` ! [${stats.processed}] ${label} => server error (${status})`);
      throw error;
    }

    logError(` ! [${stats.processed}] ${label} => ${message}`);

    // No HTTP response at all: the request never completed (or the failure is
    // local). Leave the church unstamped so it is retried later.
    if (status === undefined) {
      throw error;
    }

    // Don't throw for non-retriable errors (just mark as attempted)
    if (!dryRun) {
      await prisma.church.update({
        where: { id: church.id },
        data: { reverseGeocodedAt: new Date() },
      });
    }
  }
}
/**
 * Processes pending churches batch by batch until none are left, `limit`
 * churches were handled, or a shutdown / job stop is requested.
 *
 * Between churches the loop waits RATE_LIMIT_MS; every PROGRESS_INTERVAL
 * churches it writes job progress and checks whether the job was asked to
 * stop. A circuit breaker pauses the loop after repeated failures.
 *
 * Dry runs write nothing, so the batch query keeps returning the same rows.
 * Rows already previewed are therefore remembered and skipped, and the pass
 * ends once a batch contains nothing new. In practice a dry run previews at
 * most one batch.
 *
 * @param stats Run counters, updated in place.
 * @param limit Optional maximum number of churches to process.
 */
async function runSinglePass(
  stats: EnrichmentStats,
  countryCode?: string,
  limit?: number,
  dryRun: boolean = false,
  jobId?: string | null,
): Promise<void> {
  let totalProcessed = 0;
  const circuitBreaker = new CircuitBreaker();
  // Church ids already shown during a dry run (see the function comment).
  const previewedIds = new Set<string>();

  while (!shuttingDown) {
    if (limit && totalProcessed >= limit) break;

    // Circuit breaker check
    if (circuitBreaker.opened) {
      const ok = await circuitBreaker.checkAndWait();
      if (!ok) continue;
    }

    const batchLimit = limit
      ? Math.min(BATCH_SIZE, limit - totalProcessed)
      : BATCH_SIZE;
    const fetched = await getNextBatch(batchLimit, countryCode);
    if (fetched.length === 0) break;

    const churches = dryRun
      ? fetched.filter((candidate) => !previewedIds.has(candidate.id))
      : fetched;
    if (churches.length === 0) {
      log('Dry run: remaining churches were already previewed; stopping.');
      break;
    }

    for (const church of churches) {
      if (shuttingDown) break;
      if (limit && totalProcessed >= limit) break;

      stats.processed++;
      totalProcessed++;
      if (dryRun) previewedIds.add(church.id);

      try {
        await processChurch(church, stats, dryRun);
        circuitBreaker.reset();
      } catch {
        circuitBreaker.recordFailure();
        // Already logged in processChurch
      }

      // Rate limit: 1 request per second
      if (!shuttingDown) {
        await sleep(RATE_LIMIT_MS);
      }

      // Job tracking: update progress every PROGRESS_INTERVAL items
      if (jobId && stats.processed % PROGRESS_INTERVAL === 0) {
        await updateJobProgress(jobId, stats, 0);
        const stopping = await checkJobStopping(jobId);
        if (stopping) {
          log('Job stop requested via admin dashboard.');
          shuttingDown = true;
          break;
        }
      }

      // Progress logging
      if (stats.processed % 100 === 0) {
        const elapsed = (Date.now() - stats.startTime) / 1000;
        const rate = Math.round((stats.processed / elapsed) * 3600);
        const enrichRate = stats.processed > 0
          ? ((stats.enriched / stats.processed) * 100).toFixed(1)
          : '0.0';
        log(`Progress: ${stats.processed} processed, ${stats.enriched} enriched, ${stats.noCity} no-city, ${stats.errors} errors`);
        log(` Enrich rate: ${enrichRate}%, Rate: ~${rate}/hour`);
      }
    }
  }
}
/**
 * Keeps reverse geocoding in cycles until a shutdown or job stop is requested.
 *
 * One cycle drains every pending church (same per-church steps as a single
 * pass: lookup, rate-limit pause, job progress / stop check, progress log).
 * After a cycle that processed something the loop pauses 10 s; after an empty
 * cycle it waits up to an hour in 10 s steps. Churches are always processed
 * with writes enabled.
 *
 * @param stats Run counters, updated in place.
 */
async function runContinuous(
  stats: EnrichmentStats,
  countryCode?: string,
  jobId?: string | null,
): Promise<void> {
  log('Running in continuous mode. Press Ctrl+C to stop.');
  const breaker = new CircuitBreaker();

  while (!shuttingDown) {
    stats.cycles += 1;
    log(`--- Cycle ${stats.cycles} ---`);
    let handledThisCycle = 0;

    while (!shuttingDown) {
      // Circuit breaker check
      if (breaker.opened && !(await breaker.checkAndWait())) {
        continue;
      }

      const batch = await getNextBatch(BATCH_SIZE, countryCode);
      if (batch.length === 0) break;

      for (const church of batch) {
        if (shuttingDown) break;

        stats.processed += 1;
        handledThisCycle += 1;

        try {
          await processChurch(church, stats, false);
          breaker.reset();
        } catch {
          breaker.recordFailure();
        }

        // Rate limit
        if (!shuttingDown) {
          await sleep(RATE_LIMIT_MS);
        }

        // Job tracking
        if (jobId && stats.processed % PROGRESS_INTERVAL === 0) {
          await updateJobProgress(jobId, stats, 0);
          if (await checkJobStopping(jobId)) {
            log('Job stop requested via admin dashboard.');
            shuttingDown = true;
            break;
          }
        }

        // Progress logging
        if (stats.processed % 100 === 0) {
          const elapsedSeconds = (Date.now() - stats.startTime) / 1000;
          const rate = Math.round((stats.processed / elapsedSeconds) * 3600);
          log(`Progress: ${stats.processed} processed, ${stats.enriched} enriched, ${stats.noCity} no-city, ${stats.errors} errors (~${rate}/hour)`);
        }
      }
    }

    if (shuttingDown) break;

    if (handledThisCycle > 0) {
      log(`Cycle ${stats.cycles} complete. ${handledThisCycle} churches processed. Brief pause...`);
      await sleep(10000);
      continue;
    }

    log('No churches needing reverse geocoding. Waiting 1 hour...');
    let waits = 0;
    while (waits < 360 && !shuttingDown) {
      await sleep(10000);
      waits += 1;
    }
  }
}
// --- Main ---
/**
 * CLI entry point for the Nominatim reverse-geocode enrichment.
 *
 * Flags: `--country <code>`, `--limit <n>`, `--dry-run`, `--continuous`,
 * `--job-id <id>` (resume a tracked job).
 *
 * `--dry-run` cannot be combined with `--continuous`: continuous mode always
 * writes to the database, so the combination is rejected instead of silently
 * ignoring the dry-run request.
 *
 * The background job is marked `completed` after a normal run and `failed`,
 * with the error message, when the run throws. Database connections are
 * closed in every case.
 */
async function main() {
  const args = process.argv.slice(2);
  const countryIndex = args.indexOf('--country');
  const limitIndex = args.indexOf('--limit');
  const dryRun = args.includes('--dry-run');
  const continuous = args.includes('--continuous');
  const countryCode = countryIndex !== -1 ? args[countryIndex + 1] : undefined;

  let limit: number | undefined;
  if (limitIndex !== -1) {
    limit = Number.parseInt(args[limitIndex + 1], 10);
    // A missing or non-numeric value would otherwise travel on as NaN.
    if (!Number.isInteger(limit) || limit <= 0) {
      logError('--limit requires a positive integer');
      process.exit(1);
    }
  }

  if (continuous && dryRun) {
    logError('--dry-run cannot be combined with --continuous (continuous mode always writes to the database)');
    process.exit(1);
  }

  // Graceful shutdown
  process.on('SIGTERM', () => {
    log('Received SIGTERM, finishing current request...');
    shuttingDown = true;
  });
  process.on('SIGINT', () => {
    log('Received SIGINT, finishing current request...');
    shuttingDown = true;
  });

  log('============================================================');
  log('Nominatim Reverse Geocode Enrichment');
  log('============================================================');
  log(`Mode: ${continuous ? 'Continuous' : 'Single pass'}`);
  log(`Country: ${countryCode || 'All'}`);
  log(`Limit: ${limit || 'No limit'}`);
  log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  log(`Rate limit: ${RATE_LIMIT_MS}ms between requests`);
  log('============================================================');

  let jobId: string | null = null;
  try {
    // Count remaining
    const remaining = await getTotalRemaining(countryCode);
    log(`Churches needing reverse geocoding: ${remaining}`);
    const estimatedHours = (remaining * RATE_LIMIT_MS / 1000 / 3600).toFixed(1);
    log(`Estimated time: ~${estimatedHours} hours @ 1 req/sec`);
    if (remaining === 0) {
      log('Nothing to do!');
      return;
    }

    // Job tracking
    jobId = await createOrResumeJob(args);
    if (!jobId) {
      jobId = await createNewJob({ countryCode, limit, continuous, dryRun });
    }
    log(`Job ID: ${jobId}`);

    const stats: EnrichmentStats = {
      processed: 0,
      enriched: 0,
      noCity: 0,
      errors: 0,
      skippedExisting: 0,
      cycles: 0,
      startTime: Date.now(),
    };

    if (continuous) {
      await runContinuous(stats, countryCode, jobId);
    } else {
      await runSinglePass(stats, countryCode, limit, dryRun, jobId);
    }

    // Complete job
    if (jobId) {
      await updateJobProgress(jobId, stats, 0);
      await completeJob(jobId);
    }

    // Print summary
    const elapsed = ((Date.now() - stats.startTime) / 1000).toFixed(1);
    const enrichRate = stats.processed > 0
      ? ((stats.enriched / stats.processed) * 100).toFixed(1)
      : '0.0';
    log('');
    log('============================================================');
    log('Reverse Geocode Enrichment Summary');
    log('============================================================');
    log(`Churches processed: ${stats.processed}`);
    log(`Cities found: ${stats.enriched}`);
    log(`No city in response: ${stats.noCity}`);
    log(`Errors: ${stats.errors}`);
    log(`Enrich rate: ${enrichRate}%`);
    log(`Elapsed: ${elapsed}s`);
    if (stats.cycles > 0) {
      log(`Cycles completed: ${stats.cycles}`);
    }
    log('============================================================');
  } catch (error: unknown) {
    // Do not leave the job row in `running` state when the run dies.
    if (jobId) {
      const message = error instanceof Error ? error.message : String(error);
      try {
        await completeJob(jobId, message);
      } catch (markError) {
        // Reporting the failure is best effort; the original error is what matters.
        logError(`Could not mark job as failed: ${markError instanceof Error ? markError.message : String(markError)}`);
      }
    }
    throw error;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

// Entry point: log the failure and exit non-zero.
main().catch((error) => {
  logError(`Fatal error: ${error.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,328 @@
#!/usr/bin/env tsx
/**
* Enrich churches with website URLs from Wikidata
*
* Queries Wikidata SPARQL endpoint for Catholic churches that have official websites,
* then matches them to existing churches in the database via proximity + name matching.
*
* Usage:
* npx tsx scripts/enrich-with-wikidata.ts --dry-run
* npx tsx scripts/enrich-with-wikidata.ts --execute
* npx tsx scripts/enrich-with-wikidata.ts --execute --country DE
* npx tsx scripts/enrich-with-wikidata.ts --job-id <uuid>
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import axios from 'axios';
// Postgres pool and Prisma client (via the pg driver adapter), both configured from DATABASE_URL.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// SPARQL endpoint that queryWikidata() sends its query to.
const WIKIDATA_SPARQL_URL = 'https://query.wikidata.org/sparql';
const MATCH_RADIUS_KM = 1.0; // Max distance for matching
// Also used as the LIMIT of each SPARQL page.
const BATCH_SIZE = 500; // SPARQL results per query
/** Prints `msg` to stdout, prefixed with the current ISO timestamp in brackets. */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
/** Writes `msg` to stderr, prefixed with the current ISO-8601 timestamp in square brackets. */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error('[' + stamp + '] ' + msg);
}
/**
 * Great-circle distance between two points, in kilometres, using the
 * haversine formula on a sphere of radius 6371 km.
 *
 * @param lat1 Latitude of the first point, decimal degrees.
 * @param lon1 Longitude of the first point, decimal degrees.
 * @param lat2 Latitude of the second point, decimal degrees.
 * @param lon2 Longitude of the second point, decimal degrees.
 * @returns Distance in kilometres.
 */
function haversineKm(lat1: number, lon1: number, lat2: number, lon2: number): number {
  const earthRadiusKm = 6371;
  const toRadians = (degrees: number): number => degrees * Math.PI / 180;

  const sinHalfLat = Math.sin(toRadians(lat2 - lat1) / 2);
  const sinHalfLon = Math.sin(toRadians(lon2 - lon1) / 2);
  const h = sinHalfLat ** 2 +
    Math.cos(toRadians(lat1)) * Math.cos(toRadians(lat2)) *
    sinHalfLon ** 2;

  return earthRadiusKm * 2 * Math.asin(Math.sqrt(h));
}
/**
 * Canonical form used when comparing names: lower-cased, combining accents
 * removed, everything except a-z, 0-9 and whitespace dropped, runs of
 * whitespace collapsed to one space, and the ends trimmed.
 */
function normalizeForMatch(str: string): string {
  const lowered = str.toLowerCase();
  const unaccented = lowered.normalize('NFD').replace(/[\u0300-\u036f]/g, ''); // strip accents
  const alphanumeric = unaccented.replace(/[^a-z0-9\s]/g, '');
  const singleSpaced = alphanumeric.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
/**
 * One row of the Wikidata SPARQL result, flattened by queryWikidata().
 */
interface WikidataChurch {
// Value of the ?churchLabel binding; '' when the binding is absent.
label: string;
// Value of the ?website binding (P856 in the query); '' when absent.
website: string;
// ?lat / ?lon parsed with parseFloat; 0 when the binding is absent.
lat: number;
lon: number;
// Entity URI with the 'http://www.wikidata.org/entity/' prefix removed; '' when absent.
wikidataId: string;
}
/**
 * Fetches one page of churches from the Wikidata SPARQL endpoint: items that
 * are an instance of (a subclass of) Q16970, have P140 = Q9592, an official
 * website (P856) and a coordinate statement (P625).
 *
 * @param country Optional ISO 3166-1 alpha-2 code (case-insensitive). When
 *   given, results are restricted to that country through P17.
 * @param offset Row offset of the page; pages are BATCH_SIZE rows long.
 * @returns The rows of this page; an empty array once the result set is exhausted.
 * @throws Error when `country` is given but is not one of the supported codes.
 *   Previously an unknown code silently removed the filter and the query ran
 *   against every country. HTTP/network errors from axios propagate unchanged.
 */
async function queryWikidata(country?: string, offset = 0): Promise<WikidataChurch[]> {
  type SparqlBinding = Record<string, { value?: string } | undefined>;

  let countryFilter = '';
  if (country) {
    // Map ISO alpha-2 to Wikidata country item
    const countryMap: Record<string, string> = {
      DE: 'Q183', FR: 'Q142', ES: 'Q29', IT: 'Q38', PL: 'Q36',
      PT: 'Q45', BR: 'Q155', NL: 'Q55', CZ: 'Q213', HU: 'Q28',
      AT: 'Q40', BE: 'Q31', CH: 'Q39', IE: 'Q27', GB: 'Q145',
      US: 'Q30', CA: 'Q16', MX: 'Q96', AR: 'Q414', CO: 'Q739',
      HR: 'Q224', SK: 'Q214', SI: 'Q215',
    };
    const code = country.trim().toUpperCase();
    const qid = countryMap[code];
    if (!qid) {
      // Fail loudly: running unfiltered would widen an --execute run to the whole dataset.
      throw new Error(
        `Unsupported country code "${country}". Supported codes: ${Object.keys(countryMap).join(', ')}`,
      );
    }
    countryFilter = `?church wdt:P17 wd:${qid} .`;
  }

  // ORDER BY keeps LIMIT/OFFSET paging stable between requests.
  const sparql = `
    SELECT ?church ?churchLabel ?website ?lat ?lon WHERE {
      ?church wdt:P31/wdt:P279* wd:Q16970 .
      ?church wdt:P140 wd:Q9592 .
      ?church wdt:P856 ?website .
      ?church p:P625 ?coordStatement .
      ?coordStatement ps:P625 ?coord .
      BIND(geof:latitude(?coord) AS ?lat)
      BIND(geof:longitude(?coord) AS ?lon)
      ${countryFilter}
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en,de,fr,es,it,pt,pl,nl,cs,hu" . }
    }
    ORDER BY ?church
    LIMIT ${BATCH_SIZE}
    OFFSET ${offset}
  `;

  const response = await axios.get(WIKIDATA_SPARQL_URL, {
    params: { query: sparql, format: 'json' },
    headers: {
      'User-Agent': 'NearestMass/1.0 (https://nearestmass.com; contact: privacy@nearestmass.com)',
      'Accept': 'application/sparql-results+json',
    },
    timeout: 60000,
  });

  const bindings: SparqlBinding[] = response.data?.results?.bindings || [];
  return bindings.map((b) => ({
    label: b.churchLabel?.value || '',
    website: b.website?.value || '',
    // Missing coordinates become 0; main() skips rows whose lat or lon is falsy.
    lat: parseFloat(b.lat?.value || '0'),
    lon: parseFloat(b.lon?.value || '0'),
    wikidataId: b.church?.value?.replace('http://www.wikidata.org/entity/', '') || '',
  }));
}
/**
 * The best database candidate found by findMatch() for one Wikidata church.
 */
interface MatchResult {
// id and name of the matched row in the church table.
churchId: string;
churchName: string;
// Haversine distance in km between the Wikidata coordinates and the DB church.
distance: number;
// Fraction (0..1) of the Wikidata label's words (3+ characters) that also occur in the DB church name.
nameScore: number;
}
/**
 * Finds the database church (currently without a website) that best matches a
 * Wikidata church by proximity and name overlap.
 *
 * Candidates are fetched with a latitude/longitude bounding box and then
 * filtered by exact haversine distance (<= MATCH_RADIUS_KM). A candidate is
 * accepted when at least half of the Wikidata label's words (3+ characters)
 * occur in its name, or when it lies within 100 m regardless of name. Among
 * accepted candidates the highest name score wins; ties go to the closest.
 *
 * @param wdChurch Wikidata church with coordinates and label.
 * @returns The best match, or null when no candidate qualifies.
 */
async function findMatch(wdChurch: WikidataChurch): Promise<MatchResult | null> {
  // Size the bounding box from MATCH_RADIUS_KM so it always contains the match
  // circle. A fixed 0.01 degree longitude window shrinks with cos(latitude)
  // (about 0.71 km at 50 degrees), which excluded candidates that were still
  // inside the match radius.
  const kmPerDegree = 6371 * Math.PI / 180; // same sphere radius as haversineKm
  const boxPadding = 1.05; // small safety margin over the exact radius
  const latDelta = (MATCH_RADIUS_KM / kmPerDegree) * boxPadding;
  // Clamp the cosine so the window stays finite close to the poles.
  const cosLat = Math.max(Math.cos(wdChurch.lat * Math.PI / 180), 0.01);
  const lonDelta = latDelta / cosLat;

  // Find nearby churches without a website
  const nearby = await prisma.church.findMany({
    where: {
      website: null,
      latitude: { gte: wdChurch.lat - latDelta, lte: wdChurch.lat + latDelta },
      longitude: { gte: wdChurch.lon - lonDelta, lte: wdChurch.lon + lonDelta },
    },
    select: { id: true, name: true, latitude: true, longitude: true },
    take: 20,
  });
  if (nearby.length === 0) return null;

  // Score each candidate
  const wdNameNorm = normalizeForMatch(wdChurch.label);
  const wdWords = wdNameNorm.split(' ').filter(w => w.length >= 3);

  let bestMatch: MatchResult | null = null;
  for (const church of nearby) {
    const dist = haversineKm(wdChurch.lat, wdChurch.lon, church.latitude, church.longitude);
    if (dist > MATCH_RADIUS_KM) continue;

    const churchWords = new Set(
      normalizeForMatch(church.name).split(' ').filter(w => w.length >= 3),
    );
    const matchingWords = wdWords.filter(w => churchWords.has(w)).length;
    const nameScore = wdWords.length > 0 ? matchingWords / wdWords.length : 0;

    // Require at least 50% word overlap or distance < 100m
    if (nameScore < 0.5 && dist > 0.1) continue;

    const isBetter = !bestMatch ||
      nameScore > bestMatch.nameScore ||
      (nameScore === bestMatch.nameScore && dist < bestMatch.distance);
    if (isBetter) {
      bestMatch = {
        churchId: church.id,
        churchName: church.name,
        distance: dist,
        nameScore,
      };
    }
  }
  return bestMatch;
}
// --- Job Tracking ---
/**
 * Resumes an existing background job when `--job-id <id>` is present in `args`.
 *
 * The job row is set to status 'running' with a fresh `startedAt`.
 *
 * @param args Command-line arguments (without the node/script prefix).
 * @returns The resumed job ID, or null when `--job-id` was not passed.
 * @throws Error when `--job-id` is the last argument or is followed by another
 *   flag. Previously `undefined` (or the next flag) was handed to Prisma as the ID.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) return null;

  const jobId = args[jobIdIndex + 1];
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job ID value (usage: --job-id <uuid>)');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
/**
 * CLI entry point for the Wikidata website enrichment.
 *
 * Flags:
 *   --execute         write matched websites to the database (default is a dry run)
 *   --country <code>  restrict the Wikidata query to one country
 *   --job-id <uuid>   resume an existing background job row
 *
 * Pages through queryWikidata(), matches each row with findMatch() and, in
 * execute mode, stores the website on the matched church. When a job is being
 * tracked, progress is written after every page and a 'stopping' status ends
 * the run early.
 *
 * @throws Error when `--country` is passed without a value; any error from the
 *   enrichment loop is rethrown after the job has been marked failed.
 */
async function main() {
  const args = process.argv.slice(2);
  const dryRun = !args.includes('--execute');
  const countryIdx = args.indexOf('--country');
  let country: string | undefined;
  if (countryIdx !== -1) {
    const value = args[countryIdx + 1];
    // A missing value used to fall back to "All", silently widening the run.
    if (!value || value.startsWith('--')) {
      throw new Error('--country requires a country code, e.g. --country DE');
    }
    country = value;
  }

  log('============================================================');
  log('Wikidata Church Website Enrichment');
  log('============================================================');
  log(`Mode: ${dryRun ? 'Dry run' : 'Execute'}`);
  log(`Country: ${country || 'All'}`);
  log('============================================================');

  try {
    // Job tracking
    let jobId = await createOrResumeJob(args);
    if (!jobId && !dryRun) {
      const job = await prisma.backgroundJob.create({
        data: {
          type: 'wikidata-enrichment',
          status: 'running',
          startedAt: new Date(),
          config: { country, dryRun },
        },
      });
      jobId = job.id;
      log(`Job ID: ${jobId}`);
    }

    let totalFetched = 0;
    let matched = 0;
    let updated = 0;
    let noMatch = 0;
    let offset = 0;

    try {
      while (true) {
        log(`Querying Wikidata (offset ${offset})...`);
        const results = await queryWikidata(country, offset);
        if (results.length === 0) {
          log('No more results from Wikidata.');
          break;
        }
        totalFetched += results.length;
        log(`Fetched ${results.length} churches from Wikidata (total: ${totalFetched})`);

        for (const wdChurch of results) {
          // Rows without a website or with missing/zero coordinates cannot be matched.
          if (!wdChurch.website || !wdChurch.lat || !wdChurch.lon) continue;

          const match = await findMatch(wdChurch);
          if (!match) {
            noMatch++;
            continue;
          }
          matched++;
          log(` Match: "${wdChurch.label}" (${wdChurch.wikidataId}) -> "${match.churchName}" (dist: ${match.distance.toFixed(3)}km, score: ${match.nameScore.toFixed(2)})`);

          if (!dryRun) {
            await prisma.church.update({
              where: { id: match.churchId },
              data: {
                website: wdChurch.website,
                hasWebsite: true,
              },
            });
            updated++;
          }
        }

        // Rate limit SPARQL queries
        await new Promise(r => setTimeout(r, 2000));
        offset += BATCH_SIZE;

        // Update job progress
        if (jobId) {
          await prisma.backgroundJob.update({
            where: { id: jobId },
            data: {
              processed: totalFetched,
              succeeded: updated,
              itemsFound: matched,
            },
          });
          // Check for stop
          const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
          if (job?.status === 'stopping') {
            log('Job stop requested.');
            break;
          }
        }
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      logError(`Error: ${message}`);
      if (jobId) {
        try {
          await prisma.backgroundJob.update({
            where: { id: jobId },
            data: { status: 'failed', error: message, completedAt: new Date() },
          });
        } catch (updateError: unknown) {
          // Do not let a bookkeeping failure hide the error that stopped the run.
          const updateMessage = updateError instanceof Error ? updateError.message : String(updateError);
          logError(`Could not mark job ${jobId} as failed: ${updateMessage}`);
        }
      }
      throw error;
    }

    // Complete job
    if (jobId) {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: { status: 'completed', completedAt: new Date(), processed: totalFetched, succeeded: updated, itemsFound: matched },
      });
    }

    // The former "Already had website" line was removed: its counter was never
    // incremented (findMatch only considers churches whose website is null),
    // so it always reported 0.
    log('');
    log('============================================================');
    log('Wikidata Enrichment Summary');
    log('============================================================');
    log(`Wikidata churches fetched: ${totalFetched}`);
    log(`Matched to DB churches: ${matched}`);
    log(`Websites updated: ${updated}`);
    log(`No match found: ${noMatch}`);
    log('============================================================');
  } finally {
    // Release database connections on success and on failure alike.
    await prisma.$disconnect();
    await pool.end();
  }
}
// Script entry point: any rejection escaping main() is logged and the process exits with status 1.
main().catch((err) => {
  const message = `Fatal error: ${err.message}`;
  logError(message);
  process.exit(1);
});

View File

@@ -0,0 +1,623 @@
#!/usr/bin/env tsx
/**
* Second-pass matching: analyze stored ChromaDB search results to find websites
* that the FreeSearch first pass missed.
*
* Usage:
* npx tsx scripts/match-search-results.ts --dry-run
* npx tsx scripts/match-search-results.ts --country IT --limit 100
* npx tsx scripts/match-search-results.ts --threshold 0.3
*
* Algorithm:
* 1. Get churches without websites that have been FreeSearch'd
* 2. Query ChromaDB search_results collection for semantically similar results
* 3. Cross-church matching: URLs from nearby churches may match
* 4. URL frequency analysis: URLs appearing for multiple churches in same area
* 5. Verify best candidates against page content
* 6. Update church.website if verified
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { Collection } from 'chromadb';
import axios from 'axios';
import { getCollection, COLLECTION_NAMES } from '../src/chromadb/collections';
import { embedSingle } from '../src/chromadb/embeddings';
// Fresh DB connection: this script builds its own pg pool (DATABASE_URL from the
// environment) and wraps it with the Prisma pg adapter. main() closes both at the end.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// --- Job Tracking ---
/**
 * Resumes an existing background job when `--job-id <id>` is present in `args`.
 *
 * The job row is set to status 'running' with a fresh `startedAt`.
 *
 * @param args Command-line arguments (without the node/script prefix).
 * @returns The resumed job ID, or null when `--job-id` was not passed.
 * @throws Error when `--job-id` is the last argument or is followed by another
 *   flag. Previously `undefined` (or the next flag) was handed to Prisma as the ID.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) return null;

  const jobId = args[jobIdIndex + 1];
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job ID value (usage: --job-id <uuid>)');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
/**
 * Inserts a background job row of type 'match-search-results' that is already
 * in status 'running', and returns its ID.
 *
 * @param config Run parameters stored on the job row as-is.
 */
async function createNewJob(config: Record<string, unknown>): Promise<string> {
  const { id } = await prisma.backgroundJob.create({
    data: {
      type: 'match-search-results',
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });
  return id;
}
/**
 * Writes the current counters to the job row.
 *
 * @param jobId     Job row to update.
 * @param processed Stored as `processed`.
 * @param found     Stored as `succeeded`.
 * @param total     Stored as `totalItems`.
 */
async function updateJobProgress(jobId: string, processed: number, found: number, total: number): Promise<void> {
  const counters = { processed, succeeded: found, totalItems: total };
  await prisma.backgroundJob.update({ where: { id: jobId }, data: counters });
}
/**
 * Reports whether the job row currently has status 'stopping'.
 * A job that cannot be found counts as not stopping.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const record = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!record) return false;
  return record.status === 'stopping';
}
/**
 * Closes a job row: status becomes 'failed' when an error message is given,
 * otherwise 'completed'; `completedAt` is set to now and `error` is stored.
 *
 * @param jobId Job row to close.
 * @param error Optional failure message.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const status = error ? 'failed' : 'completed';
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status, error, completedAt: new Date() },
  });
}
// --- Types ---
/**
 * The church columns this script works with; main() selects exactly these
 * fields from the church table.
 */
interface ChurchRecord {
id: string;
// name, address and city feed the ChromaDB query text and the page verification.
name: string;
address: string | null;
city: string | null;
// Selected by main() but not read by the code visible in this script.
state: string | null;
// Used as the ChromaDB metadata filter (churchCountry) and in log labels.
country: string;
latitude: number;
longitude: number;
}
/**
 * Running counters for one execution; created in main() and mutated by processChurch().
 */
interface MatchStats {
// Churches taken from the work list so far.
processed: number;
// Churches for which a candidate URL passed verification.
matched: number;
// Churches with no ChromaDB candidates within the threshold, or whose candidates all failed verification.
noResults: number;
// Individual candidate URLs rejected by verifyUrl().
verifyFailed: number;
// Churches whose processing threw an exception.
errors: number;
// Date.now() at the start of the run; used for elapsed time and rate.
startTime: number;
}
// --- Helpers ---
// Set to true by the SIGTERM/SIGINT handlers and by a job stop request; main() checks it before each church.
let shuttingDown = false;
/** Writes `msg` to stdout, prefixed with the current ISO-8601 timestamp in square brackets. */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
/** Writes `msg` to stderr, prefixed with the current ISO-8601 timestamp in square brackets. */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error('[' + stamp + '] ' + msg);
}
/**
 * Canonical form used when comparing names, cities and addresses:
 * lower-cased, accents folded to their base letter, everything except a-z,
 * 0-9 and whitespace dropped, whitespace collapsed, ends trimmed.
 *
 * Accents are folded (NFD decomposition, then combining marks removed) before
 * the ASCII filter runs. Without that step accented letters were deleted
 * outright ("Kraków" became "krakw", "Église" became "glise"), so accented and
 * unaccented spellings of the same word never compared equal.
 *
 * Note: this only normalizes its argument. Text that is compared against the
 * result (for example raw page text) is matched on its unaccented spelling.
 *
 * @param str Arbitrary text.
 * @returns The normalized text; '' for empty or symbol-only input.
 */
function normalizeForMatch(str: string): string {
  return str
    .toLowerCase()
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '') // strip combining accents, keep the base letter
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}
// Keyword lists used by verifyUrl(). Each entry is looked up with a plain
// substring test against the lower-cased, tag-stripped page text, so entries
// must be lower-case.

// Words indicating a church/parish page; required by verifyUrl's "domain matches name" rule.
const CATHOLIC_KEYWORDS = [
'parish', 'church', 'catholic', 'parroquia', 'paroisse', 'pfarrei',
'parafia', 'paroquia', 'parrocchia', 'farnost', 'plebania', 'parochie',
'župnija', 'farnosť', 'iglesia', 'église', 'kirche', 'kościół',
'chiesa', 'kostel', 'templom', 'kerk',
];
// Phrases indicating the page lists Mass/service times; the strongest positive signal in verifyUrl.
const MASS_SCHEDULE_KEYWORDS = [
'mass schedule', 'mass times', 'worship schedule', 'worship times',
'service times', 'sunday mass', 'weekday mass',
'horario de misas', 'horarios de misa', 'horaires des messes',
'gottesdienst', 'gottesdienstzeiten', 'messzeiten',
'msze święte', 'godziny mszy', 'msze św',
'orari delle messe', 'orario messe',
'horário das missas',
];
// Phrases typical of tourism pages; such a page is rejected unless it also has a Mass-schedule phrase.
const TOURISM_KEYWORDS = [
'tourism', 'turismo', 'tourisme', 'turisme', 'touristik', 'turistico',
'attractions', 'things to do', 'sightseeing', 'sehenswürdigkeiten',
'what to see', 'places to visit', 'travel guide', 'reiseführer',
'patrimoine', 'heritage trail', 'cultural heritage',
'punto de interés', 'point of interest', 'points of interest',
];
/**
 * Extracts the distinguishing words of a church name: the normalized name is
 * split on spaces, and words shorter than 3 characters or found in the stop
 * list (generic religious terms, common saint names, articles/prepositions in
 * several languages) are dropped.
 *
 * The stop list is passed through normalizeForMatch() before the lookup, the
 * same way the name is. Tokens coming out of normalizeForMatch() are ASCII
 * only, so accented entries such as 'église' or 'basílica' could never match
 * in their raw form and those generic words were wrongly kept as significant.
 *
 * @param name Church name as stored in the database.
 * @returns Significant words in their original order (duplicates preserved).
 */
function getSignificantWords(name: string): string[] {
  const stopWords = new Set([
    'the', 'of', 'and', 'in', 'at', 'for', 'our', 'lady',
    'st', 'saint', 'saints', 'san', 'sant', 'santa', 'santo', 'sacred',
    'christ', 'jesus', 'mary', 'maria', 'king', 'lord', 'heart',
    'cross', 'lady', 'queen', 'angel', 'angels', 'good', 'star',
    'nome', 'pere', 'madre', 'notre', 'dame', 'bien',
    'onze', 'lieve', 'vrouw', 'heer',
    'rosa', 'paul', 'anne', 'jean', 'joan', 'luke', 'marc',
    'rita', 'jose', 'leon', 'pius', 'roch', 'yves', 'ines',
    'vita', 'fara', 'bona',
    'cristo', 'fatima', 'lourdes', 'perpetuo', 'socorro', 'calvario',
    'rosario', 'pilar', 'carmen', 'dolores', 'remedios', 'nieves',
    'grotte', 'mission', 'sagrada', 'sagrado', 'familia',
    'guadalupe', 'assumption', 'immaculate', 'perpetual', 'divine',
    'knights', 'columbus',
    'house', 'home', 'hall', 'center', 'centre', 'centro',
    'deacon', 'priest', 'bishop', 'father', 'sister', 'brother',
    'school', 'academy', 'college', 'seminary', 'rectory', 'retreat',
    'church', 'parish', 'catholic', 'roman', 'holy', 'chapel',
    'cathedral', 'basilica', 'shrine', 'convent', 'monastery',
    'chapelle', 'eglise', 'église', 'paroisse', 'couvent', 'grotte',
    'iglesia', 'parroquia', 'capilla', 'ermita', 'convento', 'basílica',
    'kirche', 'kapelle', 'pfarrei', 'kloster',
    'chiesa', 'parrocchia', 'cappella', 'oratorio',
    'igreja', 'capela', 'paroquia',
    'kościół', 'kaplica', 'parafia', 'droga',
    'kostel', 'kaple', 'farnost', 'templom', 'kápolna',
    'de', 'la', 'le', 'les', 'du', 'des', 'el', 'los', 'las',
    'di', 'del', 'della', 'delle', 'degli',
    'do', 'da', 'dos', 'das',
    'und', 'der', 'die', 'das', 'von',
    'nad', 'pod', 'przy',
  ].map((word) => normalizeForMatch(word))); // compare like with like

  return normalizeForMatch(name)
    .split(' ')
    .filter(w => w.length >= 3 && !stopWords.has(w));
}
/**
 * Reduces an HTML document to lower-cased plain text: <script> and <style>
 * elements are removed with their content, remaining tags and named entities
 * (such as &nbsp;) become a space, and whitespace runs collapse to one space.
 * The result is not trimmed.
 */
function stripHtml(html: string): string {
  const withoutScripts = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
  const withoutStyles = withoutScripts.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, ' ');
  const withoutEntities = withoutTags.replace(/&[a-z]+;/gi, ' ');
  const singleSpaced = withoutEntities.replace(/\s+/g, ' ');
  return singleSpaced.toLowerCase();
}
// --- URL Verification (same logic as enrich-with-freesearch.ts) ---
/**
 * Downloads `url` and decides whether the page plausibly belongs to `church`.
 *
 * Signals taken from the tag-stripped page text: how many significant name
 * words occur, whether the normalized city occurs, whether at least two
 * address words (4+ characters, not purely numeric) occur, and whether any
 * Catholic / Mass-schedule / tourism keyword occurs. Signals taken from the
 * URL: whether the hostname contains a significant name word (4+ characters)
 * and whether the path has more than two segments.
 *
 * Rejections come first: tourism pages without a Mass-schedule phrase, and
 * deep URLs that have neither a matching hostname nor a Mass-schedule phrase.
 * Otherwise the page is accepted on any of the positive rules below.
 *
 * @returns true when accepted; false when rejected, when the response body is
 *   not a string, or when the request (or anything else) throws.
 */
async function verifyUrl(url: string, church: ChurchRecord): Promise<boolean> {
  try {
    const { data } = await axios.get(url, {
      timeout: 10000,
      maxRedirects: 3,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; NearestMass/1.0; +https://nearestmass.com)',
        'Accept': 'text/html',
      },
      maxContentLength: 200000,
      responseType: 'text',
    });
    if (typeof data !== 'string') return false;

    const pageText = stripHtml(data);
    const containsAny = (keywords: string[]): boolean =>
      keywords.some(keyword => pageText.includes(keyword));

    // --- Signals from the page text ---
    const nameWords = getSignificantWords(church.name);
    const nameMatches = nameWords.filter(word => pageText.includes(word)).length;

    const cityNorm = church.city ? normalizeForMatch(church.city) : '';
    const cityMatch = cityNorm.length > 2 && pageText.includes(cityNorm);

    const addressWords = church.address
      ? normalizeForMatch(church.address).split(' ').filter(w => w.length >= 4 && !/^\d+$/.test(w))
      : [];
    const addressMatch = addressWords.filter(w => pageText.includes(w)).length >= 2;

    const hasCatholicKeyword = containsAny(CATHOLIC_KEYWORDS);
    const hasMassSchedule = containsAny(MASS_SCHEDULE_KEYWORDS);
    const isTourismPage = containsAny(TOURISM_KEYWORDS);

    // --- Signals from the URL itself (left false when the URL cannot be parsed) ---
    let domainMatchesName = false;
    let isDeepUrl = false;
    try {
      const parsed = new URL(url);
      const hostname = parsed.hostname.toLowerCase();
      domainMatchesName = nameWords.some(word => word.length >= 4 && hostname.includes(word));
      isDeepUrl = parsed.pathname.split('/').filter(Boolean).length > 2;
    } catch { /* ignore */ }

    // --- Rejections ---
    if (isTourismPage && !hasMassSchedule) return false;
    if (isDeepUrl && !domainMatchesName && !hasMassSchedule) return false;

    // --- Acceptance rules ---
    if (nameMatches >= 1 && (hasMassSchedule || (domainMatchesName && hasCatholicKeyword))) {
      return true;
    }
    const hasCity = !!(church.city && church.city.trim());
    if (hasCity) {
      return nameMatches >= 2 || (nameMatches >= 1 && (cityMatch || addressMatch));
    }
    // Without a city on record, demand an address hit or a stronger name match.
    return (nameMatches >= 1 && addressMatch) || nameMatches >= 3;
  } catch {
    return false;
  }
}
// --- ChromaDB Querying ---
/**
 * One stored search result returned by the ChromaDB query, flattened from the
 * entry's metadata by findCandidatesForChurch().
 */
interface ChromaResult {
// ChromaDB entry ID.
id: string;
// metadata.resultUrl / metadata.resultTitle; '' when missing.
url: string;
title: string;
// metadata.score; 0 when missing.
score: number;
// Embedding distance reported by the query (lower is closer); 1 when not reported.
distance: number;
// Church the search result was originally stored for (metadata.churchId / churchName / churchCity).
churchId: string;
churchName: string;
churchCity: string;
// metadata.verified; false when missing.
verified?: boolean;
}
/**
 * Looks up stored search results that are semantically close to a church.
 *
 * The church's name, address, city and country are embedded and used to query
 * the collection, restricted to entries whose `churchCountry` metadata equals
 * the church's country.
 *
 * @param church     Church to find candidates for.
 * @param collection ChromaDB collection holding the stored search results.
 * @param threshold  Maximum embedding distance for a result to be kept.
 * @param nResults   Maximum number of results requested from ChromaDB.
 * @returns Results within `threshold` that carry a URL, in the order ChromaDB
 *   returned them. Entries with missing metadata are skipped (they have no
 *   URL) instead of throwing and aborting the whole lookup.
 */
async function findCandidatesForChurch(
  church: ChurchRecord,
  collection: Collection,
  threshold: number,
  nResults: number
): Promise<ChromaResult[]> {
  // Build identity text for semantic search
  const identityText = `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim();
  const queryEmbedding = await embedSingle(identityText);

  const results = await collection.query({
    queryEmbeddings: [queryEmbedding],
    nResults,
    where: { churchCountry: church.country },
  });

  // One query embedding was sent, so only the first row of each result array is relevant.
  const ids = results.ids[0];
  if (!ids) return [];
  const metadatas = results.metadatas?.[0] ?? [];
  const distances = results.distances?.[0] ?? [];

  return ids
    .map((id, i) => {
      // An entry stored without metadata comes back as null.
      const metadata = (metadatas[i] ?? {}) as Record<string, unknown>;
      return {
        id,
        url: (metadata.resultUrl as string) || '',
        title: (metadata.resultTitle as string) || '',
        score: (metadata.score as number) || 0,
        distance: distances[i] ?? 1,
        churchId: (metadata.churchId as string) || '',
        churchName: (metadata.churchName as string) || '',
        churchCity: (metadata.churchCity as string) || '',
        verified: (metadata.verified as boolean) || false,
      };
    })
    .filter(r => r.distance <= threshold && r.url);
}
/**
 * Keeps one result per URL — the one with the smallest distance, the first
 * seen winning ties — and returns the survivors sorted by ascending distance.
 */
function deduplicateByUrl(results: ChromaResult[]): ChromaResult[] {
  const closestByUrl = new Map<string, ChromaResult>();
  results.forEach((candidate) => {
    const current = closestByUrl.get(candidate.url);
    // Only a strictly smaller distance replaces the entry already kept.
    if (current && !(candidate.distance < current.distance)) return;
    closestByUrl.set(candidate.url, candidate);
  });
  return Array.from(closestByUrl.values()).sort((left, right) => left.distance - right.distance);
}
// --- Main Processing ---
/**
 * Tries to find and verify a website for one church from stored search results.
 *
 * Steps: fetch up to 20 ChromaDB candidates within `threshold`, build a
 * prioritized list of distinct URLs, verify at most the first five with
 * verifyUrl() and stop at the first that passes. On success (and unless
 * `dryRun`) the church row is updated and the ChromaDB entry is flagged as
 * verified. All outcomes are recorded in `stats`; exceptions are counted and
 * logged rather than rethrown.
 *
 * @param church     Church to process.
 * @param collection ChromaDB collection of stored search results.
 * @param stats      Run counters, mutated in place.
 * @param threshold  Maximum embedding distance for candidates.
 * @param dryRun     When true nothing is written to the database or ChromaDB.
 */
async function processChurch(
  church: ChurchRecord,
  collection: Collection,
  stats: MatchStats,
  threshold: number,
  dryRun: boolean
): Promise<void> {
  const label = `${church.name} (${church.city || 'unknown'}, ${church.country})`;
  try {
    // 1. Semantic search for similar results in ChromaDB
    const candidates = await findCandidatesForChurch(church, collection, threshold, 20);
    if (candidates.length === 0) {
      log(` - ${label} => no ChromaDB results within threshold`);
      stats.noResults++;
      return;
    }

    // 2. Separate results: own church vs cross-church
    const ownResults = candidates.filter(r => r.churchId === church.id);
    const crossResults = candidates.filter(r => r.churchId !== church.id);

    // 3. URL frequency: URLs appearing for multiple churches are likely real parish/diocese sites.
    //    Count distinct churches per URL. Counting result rows let a URL that was
    //    stored twice for the same church qualify as "multiple churches".
    const churchesByUrl = new Map<string, Set<string>>();
    for (const r of candidates) {
      const owners = churchesByUrl.get(r.url) ?? new Set<string>();
      owners.add(r.churchId);
      churchesByUrl.set(r.url, owners);
    }

    // 4. Build the candidate list in priority order; the Set keeps first-insertion order
    //    and drops repeats.
    const urlsToTry = new Set<string>();
    // Verified URLs from nearby churches (highest priority)
    for (const r of crossResults) {
      if (r.verified) urlsToTry.add(r.url);
    }
    // High-frequency URLs (appear in results for multiple churches)
    for (const [url, owners] of churchesByUrl) {
      if (owners.size >= 2) urlsToTry.add(url);
    }
    // Own church results by distance (closest semantic match first)
    for (const r of deduplicateByUrl(ownResults)) urlsToTry.add(r.url);
    // Cross-church results from same city
    const sameCityCross = crossResults.filter(r =>
      church.city && r.churchCity &&
      normalizeForMatch(r.churchCity) === normalizeForMatch(church.city)
    );
    for (const r of deduplicateByUrl(sameCityCross)) urlsToTry.add(r.url);

    // Limit to top 5 candidates
    const topUrls = [...urlsToTry].slice(0, 5);
    log(` ? ${label} => ${candidates.length} results, trying ${topUrls.length} candidates`);

    // 5. Verify each candidate, stopping at the first that passes
    let verifiedUrl: string | null = null;
    for (const url of topUrls) {
      if (await verifyUrl(url, church)) {
        verifiedUrl = url;
        break;
      }
      stats.verifyFailed++;
    }

    if (!verifiedUrl) {
      log(` ~ ${label} => ${topUrls.length} candidates failed verification`);
      stats.noResults++;
      return;
    }

    log(` + ${label} => ${verifiedUrl}`);
    stats.matched++;
    if (dryRun) return;

    await prisma.church.update({
      where: { id: church.id },
      data: {
        website: verifiedUrl,
        hasWebsite: true,
      },
    });

    // Mark in ChromaDB (update replaces metadata, so include all fields).
    // Best effort: the database is already updated, so a failure here is only logged.
    // NOTE(review): searchQuery is written as '' because the original value is not
    // part of ChromaResult — confirm that losing it is acceptable.
    try {
      const matchingResult = candidates.find(r => r.url === verifiedUrl);
      if (matchingResult) {
        await collection.update({
          ids: [matchingResult.id],
          metadatas: [{
            churchId: matchingResult.churchId,
            churchName: matchingResult.churchName,
            churchCity: matchingResult.churchCity,
            churchCountry: church.country,
            searchQuery: '',
            resultUrl: verifiedUrl,
            resultTitle: matchingResult.title || '',
            score: matchingResult.score || 0,
            verified: true,
          }],
        });
      }
    } catch (chromaError: unknown) {
      const reason = chromaError instanceof Error ? chromaError.message : String(chromaError);
      logError(` ! ${label} => could not mark result as verified in ChromaDB: ${reason}`);
    }
  } catch (error: unknown) {
    stats.errors++;
    const reason = error instanceof Error ? error.message : String(error);
    logError(` ! ${label} => error: ${reason}`);
  }
}
// --- Main ---
/**
 * CLI entry point for the second-pass matching run.
 *
 * Flags:
 *   --country <code>   only process churches of this country
 *   --limit <n>        maximum number of churches (default 500)
 *   --threshold <x>    maximum embedding distance for candidates (default 0.4)
 *   --dry-run          verify and report, but write nothing to churches/ChromaDB
 *   --job-id <uuid>    resume an existing background job row
 *
 * Exits with status 1 when ChromaDB is unavailable and with status 0 when the
 * search_results collection is empty. SIGTERM/SIGINT and a job stop request
 * end the loop after the current church.
 *
 * @throws Error when `--limit` or `--threshold` is not a valid number; errors
 *   from the run are rethrown after the job has been marked failed.
 */
async function main() {
  const args = process.argv.slice(2);
  const countryIndex = args.indexOf('--country');
  const limitIndex = args.indexOf('--limit');
  const thresholdIndex = args.indexOf('--threshold');
  const dryRun = args.includes('--dry-run');
  const countryCode = countryIndex !== -1 ? args[countryIndex + 1] : undefined;
  const limit = limitIndex !== -1 ? Number.parseInt(args[limitIndex + 1] ?? '', 10) : 500;
  const threshold = thresholdIndex !== -1 ? Number.parseFloat(args[thresholdIndex + 1] ?? '') : 0.4;

  // Reject unusable numbers up front: NaN would otherwise reach Prisma's `take`
  // or silently filter out every candidate.
  if (!Number.isInteger(limit) || limit < 0) {
    throw new Error('--limit must be a non-negative integer');
  }
  if (!Number.isFinite(threshold) || threshold < 0) {
    throw new Error('--threshold must be a non-negative number');
  }

  // Graceful shutdown
  process.on('SIGTERM', () => { log('Received SIGTERM'); shuttingDown = true; });
  process.on('SIGINT', () => { log('Received SIGINT'); shuttingDown = true; });

  log('============================================================');
  log('Second-Pass Search Result Matching');
  log('============================================================');
  log(`Country: ${countryCode || 'All'}`);
  log(`Limit: ${limit}`);
  log(`Threshold: ${threshold}`);
  log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  log('============================================================');

  try {
    // Connect to ChromaDB
    let collection: Collection;
    try {
      collection = await getCollection(COLLECTION_NAMES.SEARCH_RESULTS);
      log('ChromaDB search_results collection connected');
    } catch (e: unknown) {
      const reason = e instanceof Error ? e.message : String(e);
      logError(`ChromaDB unavailable: ${reason}`);
      logError('This script requires ChromaDB. Make sure it is running.');
      process.exit(1);
    }

    // Check collection has data
    const count = await collection.count();
    log(`ChromaDB search_results: ${count} entries`);
    if (count === 0) {
      log('No search results stored yet. Run enrich-with-freesearch.ts first.');
      process.exit(0);
    }

    // Job tracking
    const jobId = (await createOrResumeJob(args))
      ?? (await createNewJob({ countryCode, limit, threshold, dryRun }));
    log(`Job ID: ${jobId}`);

    const stats: MatchStats = {
      processed: 0,
      matched: 0,
      noResults: 0,
      verifyFailed: 0,
      errors: 0,
      startTime: Date.now(),
    };

    try {
      // Get churches without websites that have been FreeSearch'd
      const whereClause: Record<string, unknown> = {
        source: 'osm',
        website: null,
        freeSearchedAt: { not: null },
      };
      if (countryCode) {
        whereClause.country = countryCode;
      }
      const churches = await prisma.church.findMany({
        // The filter is assembled dynamically, so it is passed through untyped as before.
        where: whereClause as any,
        select: {
          id: true, name: true, address: true, city: true, state: true,
          country: true, latitude: true, longitude: true,
        },
        take: limit,
        orderBy: { updatedAt: 'asc' },
      });
      log(`Found ${churches.length} churches without websites (already FreeSearch'd)`);

      for (const church of churches) {
        if (shuttingDown) break;
        stats.processed++;
        await processChurch(church, collection, stats, threshold, dryRun);

        // Job tracking every 10 items
        if (stats.processed % 10 === 0) {
          await updateJobProgress(jobId, stats.processed, stats.matched, churches.length);
          const stopping = await checkJobStopping(jobId);
          if (stopping) {
            log('Job stop requested via admin dashboard.');
            shuttingDown = true;
            break;
          }
        }

        // Progress logging every 50 items
        if (stats.processed % 50 === 0) {
          const elapsedSeconds = (Date.now() - stats.startTime) / 1000;
          const rate = Math.round((stats.processed / elapsedSeconds) * 3600);
          log(`Progress: ${stats.processed}/${churches.length} processed, ${stats.matched} matched, ${stats.noResults} no match, ${stats.errors} errors (~${rate}/hour)`);
        }
      }

      // Complete job
      await updateJobProgress(jobId, stats.processed, stats.matched, churches.length);
      await completeJob(jobId);
    } catch (error: unknown) {
      // Record the failure on the job row so it does not stay 'running' forever.
      const reason = error instanceof Error ? error.message : String(error);
      try {
        await completeJob(jobId, reason);
      } catch (markError: unknown) {
        const markReason = markError instanceof Error ? markError.message : String(markError);
        logError(`Could not mark job ${jobId} as failed: ${markReason}`);
      }
      throw error;
    }

    // Print summary
    const elapsed = ((Date.now() - stats.startTime) / 1000).toFixed(1);
    const matchRate = stats.processed > 0
      ? ((stats.matched / stats.processed) * 100).toFixed(1)
      : '0.0';
    log('');
    log('============================================================');
    log('Second-Pass Matching Summary');
    log('============================================================');
    log(`Churches processed: ${stats.processed}`);
    log(`Websites matched: ${stats.matched}`);
    log(`No match found: ${stats.noResults}`);
    log(`Verify rejected: ${stats.verifyFailed}`);
    log(`Errors: ${stats.errors}`);
    log(`Match rate: ${matchRate}%`);
    log(`Elapsed: ${elapsed}s`);
    log('============================================================');
  } finally {
    // Release database connections on success and on failure alike.
    await prisma.$disconnect();
    await pool.end();
  }
}
// Script entry point: any rejection escaping main() is logged and the process exits with status 1.
main().catch((err) => {
  const message = `Fatal error: ${err.message}`;
  logError(message);
  process.exit(1);
});

View File

@@ -0,0 +1,110 @@
/**
* Normalize country codes in the database.
* Converts full country names to ISO 3166-1 alpha-2 codes.
*
* Usage:
* npx tsx scripts/normalize-country-codes.ts --dry-run
* npx tsx scripts/normalize-country-codes.ts --execute
*/
import path from 'path';
import dotenv from 'dotenv';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { normalizeCountryCode } from '../src/lib/country-normalize';
// Database access: a pg connection pool (DATABASE_URL from the environment) wrapped
// by the Prisma pg adapter. main() closes both when it finishes.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Reports, and with `--execute` applies, the normalization of `church.country`
 * values to ISO 3166-1 alpha-2 codes.
 *
 * Every distinct non-null country value is passed through
 * normalizeCountryCode(). Values that change are listed with the number of
 * affected churches; unchanged values are reported either as already
 * normalized (two upper-case characters) or as unknown. Without `--execute`
 * nothing is written.
 */
async function main() {
  const dryRun = !process.argv.includes('--execute');
  if (dryRun) {
    console.log('DRY RUN — no changes will be made. Use --execute to apply.\n');
  }

  // Get all distinct country values
  const rows = await prisma.church.findMany({
    select: { country: true },
    distinct: ['country'],
    where: { country: { not: null } },
  });
  const countryValues = rows
    .map(row => row.country)
    .filter((value): value is string => value !== null);
  console.log(`Found ${countryValues.length} distinct country values.\n`);

  // Sort every value into one of three buckets
  const changes: { original: string; normalized: string; count?: number }[] = [];
  const alreadyNormalized: string[] = [];
  const unknown: string[] = [];
  for (const country of countryValues) {
    const normalized = normalizeCountryCode(country);
    if (normalized !== country) {
      changes.push({ original: country, normalized });
      continue;
    }
    // Unchanged: either already an ISO-looking code, or something the normalizer does not know
    const looksLikeIsoCode = country.length === 2 && country === country.toUpperCase();
    (looksLikeIsoCode ? alreadyNormalized : unknown).push(country);
  }

  // Attach the number of affected churches to each pending change
  for (const change of changes) {
    change.count = await prisma.church.count({
      where: { country: change.original },
    });
  }

  // Report
  console.log(`Already normalized (${alreadyNormalized.length}): ${alreadyNormalized.sort().join(', ')}\n`);
  if (changes.length === 0) {
    console.log('No changes needed — all country values are already normalized.\n');
  } else {
    console.log(`Changes to apply (${changes.length}):`);
    changes.forEach(({ original, normalized, count }) => {
      console.log(` "${original}" → "${normalized}" (${count} churches)`);
    });
    console.log();
  }
  if (unknown.length > 0) {
    console.log(`Unknown values (${unknown.length}): ${unknown.join(', ')}`);
    console.log(' These could not be mapped to ISO codes. Review manually.\n');
  }

  // Apply changes
  if (!dryRun && changes.length > 0) {
    let totalUpdated = 0;
    for (const { original, normalized } of changes) {
      const { count: updatedRows } = await prisma.church.updateMany({
        where: { country: original },
        data: { country: normalized },
      });
      totalUpdated += updatedRows;
      console.log(`Updated "${original}" → "${normalized}": ${updatedRows} churches`);
    }
    console.log(`\nTotal updated: ${totalUpdated} churches`);
  }

  await prisma.$disconnect();
  await pool.end();
}
// Script entry point: any rejection escaping main() is printed and the process exits with status 1.
main().catch((failure) => {
  console.error('Error:', failure);
  process.exit(1);
});

View File

@@ -0,0 +1,197 @@
/**
* Bulk-populate ChromaDB collections from the database.
*
* Usage:
* npx tsx scripts/populate-chromadb.ts --collection church_identity
* npx tsx scripts/populate-chromadb.ts --collection page_classification
* npx tsx scripts/populate-chromadb.ts --all
* npx tsx scripts/populate-chromadb.ts --all --batch-size 50 --limit 1000
*/
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { getCollection, COLLECTION_NAMES, CollectionName } from '../src/chromadb/collections';
import { embed } from '../src/chromadb/embeddings';
// Database access: a pg connection pool (DATABASE_URL from the environment) wrapped
// by the Prisma pg adapter.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// Command-line flags, read once at start-up.
const args = process.argv.slice(2);
// --collection <name>: the argument following the flag; null when the flag is absent.
const collectionArg = args.indexOf('--collection') === -1
  ? null
  : args[args.indexOf('--collection') + 1];
// --all: populate every supported collection.
const populateAll = args.indexOf('--all') !== -1;
// --batch-size <n>: rows per database page / embedding call (default 100).
const batchSize = args.indexOf('--batch-size') === -1
  ? 100
  : parseInt(args[args.indexOf('--batch-size') + 1]);
// --limit <n>: cap on processed rows; 0 (the default) means no cap.
const limit = args.indexOf('--limit') === -1
  ? 0
  : parseInt(args[args.indexOf('--limit') + 1]);
/**
 * Fills the church_identity collection: walks the church table in id order
 * using cursor pagination, embeds "name address city country" for each church
 * and upserts it under the ID `church-<id>` together with its country, source
 * and coordinates. Stops after `limit` churches when a limit was given.
 */
async function populateChurchIdentity() {
  console.log('\n=== Populating church_identity ===');
  const collection = await getCollection(COLLECTION_NAMES.CHURCH_IDENTITY);

  const totalCount = await prisma.church.count();
  const target = limit > 0 ? Math.min(limit, totalCount) : totalCount;
  console.log(`Total churches: ${totalCount}, processing: ${target}`);

  let indexed = 0;
  let lastSeenId: string | undefined = undefined;
  while (indexed < target) {
    const pageSize = Math.min(batchSize, target - indexed);
    // After the first page, resume just past the last row already handled.
    const pagination = lastSeenId ? { skip: 1, cursor: { id: lastSeenId } } : {};
    const page = await prisma.church.findMany({
      take: pageSize,
      ...pagination,
      orderBy: { id: 'asc' },
      select: {
        id: true,
        name: true,
        address: true,
        city: true,
        country: true,
        source: true,
        latitude: true,
        longitude: true,
      },
    });
    if (page.length === 0) break;

    const documents = page.map(
      (church) => `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim()
    );
    const embeddings = await embed(documents);
    const ids = page.map((church) => `church-${church.id}`);
    const metadatas = page.map((church) => ({
      churchId: church.id,
      country: church.country,
      source: church.source,
      lat: church.latitude,
      lng: church.longitude,
    }));
    await collection.upsert({ ids, embeddings, documents, metadatas });

    indexed += page.length;
    lastSeenId = page[page.length - 1].id;
    console.log(` Processed ${indexed}/${target}`);
  }
  console.log(` Done: ${indexed} churches indexed`);
}
/**
 * Fills the page_classification collection with positive examples: churches
 * that have been scraped and have at least one active mass schedule. For each
 * such church that has stored raw HTML, the first 2000 characters of that HTML
 * are embedded and upserted under the ID `page-<id>` with the page URL, the
 * flag isMassSchedulePage = true and the website language. Churches without
 * raw HTML still count towards the processed total.
 */
async function populatePageClassification() {
  console.log('\n=== Populating page_classification ===');
  const collection = await getCollection(COLLECTION_NAMES.PAGE_CLASSIFICATION);

  // Index churches that have been successfully scraped (have mass schedules)
  const scrapedWithSchedules = {
    lastScrapedAt: { not: null },
    massSchedules: { some: { isActive: true } },
  };
  const totalCount = await prisma.church.count({ where: scrapedWithSchedules });
  const target = limit > 0 ? Math.min(limit, totalCount) : totalCount;
  console.log(`Scraped churches with schedules: ${totalCount}, processing: ${target}`);

  let handled = 0;
  let lastSeenId: string | undefined = undefined;
  while (handled < target) {
    const pageSize = Math.min(batchSize, target - handled);
    // After the first page, resume just past the last row already handled.
    const pagination = lastSeenId ? { skip: 1, cursor: { id: lastSeenId } } : {};
    const page = await prisma.church.findMany({
      take: pageSize,
      ...pagination,
      where: scrapedWithSchedules,
      orderBy: { id: 'asc' },
      select: {
        id: true,
        massScheduleUrl: true,
        website: true,
        websiteLanguage: true,
        scraperConfig: { select: { rawHtml: true } },
      },
    });
    if (page.length === 0) break;

    // Use stored raw HTML (truncated) as the document
    const withHtml = page.filter((church) => church.scraperConfig?.rawHtml);
    if (withHtml.length > 0) {
      const documents = withHtml.map(
        (church) => (church.scraperConfig?.rawHtml || '').slice(0, 2000)
      );
      const embeddings = await embed(documents);
      const ids = withHtml.map((church) => `page-${church.id}`);
      const metadatas = withHtml.map((church) => ({
        url: church.massScheduleUrl || church.website || '',
        isMassSchedulePage: true,
        language: church.websiteLanguage || 'unknown',
      }));
      await collection.upsert({ ids, embeddings, documents, metadatas });
    }

    handled += page.length;
    lastSeenId = page[page.length - 1].id;
    console.log(` Processed ${handled}/${target} (${withHtml.length} had raw HTML)`);
  }
  console.log(` Done: ${handled} pages classified`);
}
/**
 * CLI entry point: populates the collection named by `--collection`, or every
 * supported collection when `--all` is given. Without either flag it prints
 * usage and exits with status 0; any error is logged and exits with status 1.
 *
 * The database connections are always released before the process exits.
 */
async function main() {
  // process.exit() terminates the process immediately and would skip the
  // `finally` cleanup, so the desired status is recorded here and only
  // applied once the connections have been closed.
  let exitCode: number | undefined;
  try {
    if (!populateAll && !collectionArg) {
      console.log('Usage:');
      console.log(' npx tsx scripts/populate-chromadb.ts --collection church_identity');
      console.log(' npx tsx scripts/populate-chromadb.ts --collection page_classification');
      console.log(' npx tsx scripts/populate-chromadb.ts --all');
      console.log(' npx tsx scripts/populate-chromadb.ts --all --batch-size 50 --limit 1000');
      exitCode = 0;
      return;
    }
    const collectionsToPopulate: CollectionName[] = populateAll
      ? [COLLECTION_NAMES.CHURCH_IDENTITY, COLLECTION_NAMES.PAGE_CLASSIFICATION]
      : [collectionArg as CollectionName];
    for (const name of collectionsToPopulate) {
      switch (name) {
        case COLLECTION_NAMES.CHURCH_IDENTITY:
          await populateChurchIdentity();
          break;
        case COLLECTION_NAMES.PAGE_CLASSIFICATION:
          await populatePageClassification();
          break;
        default:
          // Reached for any --collection value without a populate function.
          console.log(`Collection '${name}' does not have a populate function yet.`);
          console.log('Available: church_identity, page_classification');
      }
    }
    console.log('\nPopulation complete!');
  } catch (error) {
    console.error('Error:', error);
    exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
    if (exitCode !== undefined) process.exit(exitCode);
  }
}
main();

View File

@@ -0,0 +1,54 @@
import { config } from 'dotenv';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Load environment variables
// .env.local is read first; with dotenv's default (non-overriding) behaviour
// its values take precedence over the same keys in .env.
config({ path: '.env.local' });
config({ path: '.env' });
// Create connection pool
// NOTE(review): a missing DATABASE_URL falls back to an empty connection
// string here instead of failing fast — confirm that this is intended.
const connectionString = process.env.DATABASE_URL || '';
const pool = new Pool({ connectionString });
// Create Prisma adapter
const adapter = new PrismaPg(pool);
// Create Prisma client with adapter
// Only Prisma 'error' level events are logged.
const prisma = new PrismaClient({
adapter,
log: ['error'],
});
/**
 * Recomputes `city_normalized` for every row of `churches` whose `city` is
 * not NULL, using a single raw UPDATE statement, then logs the value returned
 * by `$executeRaw` as the number of updated churches.
 */
async function main() {
console.log('Populating cityNormalized field using SQL...');
// Use raw SQL for much faster batch update
// Normalize: lowercase, remove special chars except spaces/numbers, trim
// NOTE(review): the character class keeps only ASCII letters, digits and
// spaces, so accented or non-Latin letters are dropped as well (e.g. "é" is
// removed, not folded to "e") — confirm this matches the normalisation used
// when the column is queried.
// NOTE(review): COALESCE(city, '') is redundant given the WHERE clause.
const result = await prisma.$executeRaw`
UPDATE churches
SET city_normalized = LOWER(
TRIM(
REGEXP_REPLACE(
COALESCE(city, ''),
'[^a-zA-Z0-9 ]',
'',
'g'
)
)
)
WHERE city IS NOT NULL
`;
console.log(`✅ Updated ${result} churches with normalized cities`);
}
// Run the script. Whatever the outcome, release the Prisma client and the pg
// pool it was built on (the pool was previously never ended), then force a
// non-zero exit status if main() failed.
main()
  .catch((e: unknown) => {
    console.error(e);
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
    // process.exit() without an argument uses process.exitCode.
    if (process.exitCode) process.exit();
  });

View File

@@ -0,0 +1,161 @@
#!/usr/bin/env tsx
/**
* Save mass schedules to database using scrapeChurch() service
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { scrapeChurch } from '../src/lib/scraper-service';
import { prisma } from '../src/lib/db';
// Country codes processed by saveSchedulesToDb(), in this order; each is
// matched against `church.country`.
const PRIORITY_COUNTRIES = ['FR', 'DE', 'ES', 'PL', 'BR'];
// Upper bound on the number of churches fetched per country in one run.
const CHURCHES_PER_COUNTRY = 5; // Start small to verify it works
/** Outcome of one scrape attempt, as recorded by saveSchedulesToDb(). */
interface ScrapeResult {
// Id and display name of the church that was scraped.
churchId: string;
churchName: string;
// Country code the church was selected under.
country: string;
// True when the scrape service reported success.
success: boolean;
// Number of schedules reported as created; 0 for failed attempts.
schedulesCreated: number;
// Failure reason; only set when `success` is false.
error?: string;
}
/**
 * Scrapes a small sample of not-yet-scraped OSM churches with a website for
 * each priority country via `scrapeChurch()` (which persists the schedules),
 * then prints a summary and a few verification queries.
 *
 * Per-church failures are logged and recorded but do not stop the run. The
 * Prisma client is disconnected even when a database query throws.
 */
async function saveSchedulesToDb() {
  console.log('Starting database save operation...\n');
  console.log(`Target: ${CHURCHES_PER_COUNTRY} churches per country`);
  console.log(`Countries: ${PRIORITY_COUNTRIES.join(', ')}\n`);
  const results: ScrapeResult[] = [];
  let totalChurches = 0;
  let totalSuccess = 0;
  let totalSchedules = 0;

  try {
    for (const country of PRIORITY_COUNTRIES) {
      console.log(`\n${'='.repeat(60)}`);
      console.log(`${country} - Finding churches to scrape...`);
      console.log('='.repeat(60));
      // Get churches with websites that haven't been scraped yet
      const churches = await prisma.church.findMany({
        where: {
          country,
          website: { not: null },
          source: 'osm',
          lastScrapedAt: null, // Only unscraped churches
        },
        take: CHURCHES_PER_COUNTRY,
        orderBy: { createdAt: 'asc' },
      });
      console.log(`Found ${churches.length} churches to scrape\n`);

      for (const [i, church] of churches.entries()) {
        totalChurches++;
        process.stdout.write(`[${i + 1}/${churches.length}] ${church.name.substring(0, 40).padEnd(40)} `);

        // Starts out as a failure; upgraded below when the scrape succeeds.
        const outcome: ScrapeResult = {
          churchId: church.id,
          churchName: church.name,
          country,
          success: false,
          schedulesCreated: 0,
        };
        try {
          // Use the scrapeChurch service which saves to database
          const result = await scrapeChurch(church.id);
          if (result.success) {
            totalSuccess++;
            totalSchedules += result.schedulesCreated;
            process.stdout.write(`${result.schedulesCreated} schedules saved\n`);
            outcome.success = true;
            outcome.schedulesCreated = result.schedulesCreated;
          } else {
            process.stdout.write(`${result.error}\n`);
            outcome.error = result.error;
          }
        } catch (err: unknown) {
          // Anything can be thrown, so narrow before reading `.message`.
          const message = err instanceof Error ? err.message : String(err);
          process.stdout.write(`❌ ERROR: ${message}\n`);
          outcome.error = message;
        }
        results.push(outcome);
      }
    }

    // Final summary
    // Avoid printing "NaN%" when no church was eligible at all.
    const successRate = totalChurches > 0
      ? ((totalSuccess / totalChurches) * 100).toFixed(1)
      : '0.0';
    console.log('\n\n');
    console.log('═'.repeat(80));
    console.log('DATABASE SAVE SUMMARY');
    console.log('═'.repeat(80));
    console.log('');
    console.log(`Total churches processed: ${totalChurches}`);
    console.log(`Successful scrapes: ${totalSuccess} (${successRate}%)`);
    console.log(`Total schedules saved to database: ${totalSchedules}`);
    console.log('');

    // Verify database records
    console.log('Verifying database records...\n');
    const dbScheduleCount = await prisma.massSchedule.count();
    const dbChurchesWithSchedules = await prisma.church.count({
      where: {
        massSchedules: {
          some: {},
        },
      },
    });
    console.log(`✓ Total mass schedules in database: ${dbScheduleCount}`);
    console.log(`✓ Churches with schedules: ${dbChurchesWithSchedules}`);
    console.log('');

    // Show sample of saved schedules
    console.log('Sample of saved schedules:\n');
    const sampleChurches = await prisma.church.findMany({
      where: {
        massSchedules: {
          some: {},
        },
      },
      include: {
        massSchedules: {
          take: 3,
          orderBy: { dayOfWeek: 'asc' },
        },
      },
      take: 3,
    });
    // Indexed by `dayOfWeek`; assumes 0 = Sunday — TODO confirm against the schema.
    const dayNames = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];
    for (const church of sampleChurches) {
      console.log(`${church.name} (${church.country}):`);
      for (const schedule of church.massSchedules) {
        console.log(` ${dayNames[schedule.dayOfWeek]} ${schedule.time} - ${schedule.language} ${schedule.massType || ''}`);
      }
      console.log('');
    }
  } finally {
    await prisma.$disconnect();
  }
}
saveSchedulesToDb().catch((error: unknown) => {
  console.error(error);
  // Report the failure to the caller instead of exiting with status 0.
  process.exitCode = 1;
});

299
scripts/scrape-churches.ts Normal file
View File

@@ -0,0 +1,299 @@
#!/usr/bin/env tsx
/**
* Bulk church website scraper
* Scrapes mass schedules from church websites and updates the database.
*
* Usage:
* npx tsx scripts/scrape-churches.ts --limit 100
* npx tsx scripts/scrape-churches.ts --limit 50 --max-failures 3
* npx tsx scripts/scrape-churches.ts --all # Process ALL eligible churches
* npx tsx scripts/scrape-churches.ts --all --language english
* npx tsx scripts/scrape-churches.ts --all --max-failures 3
* npx tsx scripts/scrape-churches.ts --ids id1,id2,id3
* npx tsx scripts/scrape-churches.ts --all --job-id <uuid> # Resume/track existing job
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { scrapeAllChurches, scrapeChurch, countEligibleChurches } from '../src/lib/scraper-service';
import type { ScrapeJobResult } from '../src/lib/scraper-service';
// Fresh DB connection for scripts
// `jobPrisma` is the client this script uses for its backgroundJob rows.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const jobPrisma = new PrismaClient({ adapter });
// Set by the SIGINT/SIGTERM handlers and by a dashboard stop request; the
// batch loop in main() stops once it is true.
let shuttingDown = false;
/**
 * Formats a duration in seconds as "Ns", "Nm Ns" or "Nh Nm".
 *
 * Fractions of a second are dropped in every branch, so a value just below a
 * minute renders as "59s" rather than being rounded up to "60s".
 *
 * @param seconds Duration in seconds (may be fractional).
 * @returns The short human-readable form.
 */
function formatDuration(seconds: number): string {
  const whole = Math.floor(seconds);
  if (whole < 60) return `${whole}s`;
  if (whole < 3600) return `${Math.floor(whole / 60)}m ${whole % 60}s`;
  const hours = Math.floor(whole / 3600);
  const minutes = Math.floor((whole % 3600) / 60);
  return `${hours}h ${minutes}m`;
}
// --- Job Tracking ---
/**
 * Resumes an existing background job when `--job-id <id>` is present.
 *
 * The job row is set to status 'running' and its `startedAt` is reset to now.
 *
 * @param args Command-line arguments (without node/script path).
 * @returns The job id that was resumed, or null when `--job-id` is absent.
 * @throws Error when `--job-id` is not followed by a value.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) return null;

  const jobId = args[jobIdIndex + 1];
  // Reject a missing value (or another flag in its place) up front rather
  // than sending an undefined id to the database.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }
  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
/**
 * Inserts a new 'scraper' background job that is already in the 'running'
 * state and started now.
 *
 * @param language Language recorded on the job; 'generic' is stored when it is null or empty.
 * @param config Run options stored on the job row.
 * @returns The id of the created job.
 */
async function createNewJob(language: string | null, config: Record<string, unknown>): Promise<string> {
  const jobLanguage = language || 'generic';
  const { id } = await jobPrisma.backgroundJob.create({
    data: {
      type: 'scraper',
      language: jobLanguage,
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });
  return id;
}
/**
 * Writes the current progress counters onto the background job row.
 *
 * @param jobId Id of the job to update.
 * @param processed Number of items handled so far.
 * @param succeeded Number of items that succeeded.
 * @param failed Number of items that failed.
 * @param itemsFound Number of items found so far.
 * @param totalItems Total number of items expected.
 */
async function updateJobProgress(jobId: string, processed: number, succeeded: number, failed: number, itemsFound: number, totalItems: number): Promise<void> {
  const counters = { processed, succeeded, failed, itemsFound, totalItems };
  await jobPrisma.backgroundJob.update({ where: { id: jobId }, data: counters });
}
/**
 * Reports whether the job's status is currently 'stopping'.
 *
 * @param jobId Id of the job to look up.
 * @returns True only when the job exists and its status is 'stopping'.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const record = await jobPrisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (record == null) return false;
  return record.status === 'stopping';
}
/**
 * Closes a background job, stamping `completedAt` with the current time.
 *
 * @param jobId Id of the job to close.
 * @param error When a non-empty message is given the job is marked 'failed'
 *   and the message stored; otherwise the job is marked 'completed'.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const status = error ? 'failed' : 'completed';
  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: { status, error, completedAt: new Date() },
  });
}
/**
 * Reads the integer that follows `flag` in `args`.
 *
 * @returns `fallback` when the flag is absent.
 * @throws Error when the flag is present but not followed by an integer.
 */
function readIntFlag(args: string[], flag: string, fallback: number): number {
  const index = args.indexOf(flag);
  if (index === -1) return fallback;
  const value = Number.parseInt(args[index + 1] ?? '', 10);
  if (Number.isNaN(value)) {
    throw new Error(`${flag} requires an integer value`);
  }
  return value;
}

/**
 * CLI entry point for the bulk scraper. Three modes, checked in this order:
 *
 * - `--ids a,b,c`: scrape exactly those churches (all started concurrently).
 * - `--all`: scrape batch after batch until no eligible church remains, a
 *   shutdown signal arrives, or the tracked job is put into 'stopping'.
 * - default: scrape one batch of `--limit` churches (100 when omitted).
 *
 * The `--all` and default modes record their progress in a background job,
 * which is resumed when `--job-id` is given and created otherwise.
 *
 * @throws Error for malformed flags, and rethrows scraper errors after the
 *   job has been marked as failed.
 */
async function main() {
  const args = process.argv.slice(2);
  const idsIndex = args.indexOf('--ids');
  const allMode = args.includes('--all');
  const langIndex = args.indexOf('--language');
  const maxFailures = readIntFlag(args, '--max-failures', 5);
  const language = langIndex !== -1 ? args[langIndex + 1] ?? null : null;

  // --ids mode: scrape specific churches
  if (idsIndex !== -1) {
    const rawIds = args[idsIndex + 1];
    const ids = (rawIds ?? '')
      .split(',')
      .map((id) => id.trim())
      .filter((id) => id.length > 0);
    if (ids.length === 0) {
      throw new Error('--ids requires a comma-separated list of church ids');
    }
    console.log('============================================================');
    console.log('Church Website Scraper — Targeted Mode');
    console.log('============================================================');
    console.log(`Targeting ${ids.length} specific churches`);
    console.log(`Max failures: ${maxFailures}`);
    console.log(`Started: ${new Date().toISOString()}`);
    console.log('============================================================\n');
    const startTime = Date.now();
    const results = await Promise.all(ids.map((id) => scrapeChurch(id)));
    printSummary(results, startTime);
    return;
  }

  // --all mode: batch loop through ALL eligible churches
  if (allMode) {
    const BATCH_SIZE = 100;
    const totalEligible = await countEligibleChurches(maxFailures);
    console.log('============================================================');
    console.log('Church Website Scraper — Full Run');
    console.log('============================================================');
    console.log(`Language: ${language || 'all'}`);
    console.log(`Eligible churches: ${totalEligible.toLocaleString()}`);
    console.log(`Batch size: ${BATCH_SIZE}`);
    console.log(`Max failures: ${maxFailures}`);
    console.log(`Started: ${new Date().toISOString()}`);
    console.log('============================================================\n');
    if (totalEligible === 0) {
      console.log('No eligible churches to scrape. All done!');
      return;
    }

    // Job tracking
    const jobId =
      (await createOrResumeJob(args)) ||
      (await createNewJob(language, { allMode: true, maxFailures, language }));
    console.log(`Job ID: ${jobId}\n`);

    // Graceful shutdown handlers
    process.on('SIGINT', () => {
      // A second Ctrl+C while already shutting down aborts immediately.
      if (shuttingDown) {
        console.log('\nForce quit.');
        process.exit(1);
      }
      console.log('\nShutting down gracefully (finishing current batch)...');
      shuttingDown = true;
    });
    process.on('SIGTERM', () => {
      console.log('\nSIGTERM received, shutting down after current batch...');
      shuttingDown = true;
    });

    const allResults: ScrapeJobResult[] = [];
    const globalStart = Date.now();
    let batchNum = 0;
    let totalSchedulesFound = 0;
    try {
      while (!shuttingDown) {
        batchNum++;
        const batchStart = Date.now();
        const batchResults = await scrapeAllChurches({ limit: BATCH_SIZE, maxFailures, language: language || undefined });
        if (batchResults.length === 0) {
          console.log('\nNo more eligible churches. All done!');
          break;
        }
        allResults.push(...batchResults);

        // Batch summary
        const batchElapsed = (Date.now() - batchStart) / 1000;
        const batchSuccess = batchResults.filter((r) => r.success).length;
        const batchSchedules = batchResults.reduce((sum, r) => sum + r.schedulesFound, 0);
        totalSchedulesFound += batchSchedules;

        // Overall progress
        const totalElapsed = (Date.now() - globalStart) / 1000;
        const rate = allResults.length / (totalElapsed / 3600);
        const remaining = totalEligible - allResults.length;
        const etaSeconds = remaining > 0 && rate > 0 ? (remaining / rate) * 3600 : 0;
        console.log(`\n--- Batch ${batchNum} (${batchResults.length} churches) ---`);
        console.log(` Success: ${batchSuccess}/${batchResults.length} | Schedules: ${batchSchedules} | Time: ${formatDuration(batchElapsed)}`);
        console.log(` Progress: ${allResults.length.toLocaleString()}/${totalEligible.toLocaleString()} (${((allResults.length / totalEligible) * 100).toFixed(1)}%)`);
        console.log(` Rate: ${rate.toFixed(0)}/hr | ETA: ~${formatDuration(etaSeconds)}`);

        // Update job progress
        const succeeded = allResults.filter((r) => r.success).length;
        const failed = allResults.length - succeeded;
        await updateJobProgress(jobId, allResults.length, succeeded, failed, totalSchedulesFound, totalEligible);

        // Check for a stop request after every batch. Gating this on
        // `allResults.length % 10` meant that one batch with an odd size
        // could disable the check for the rest of the run.
        if (await checkJobStopping(jobId)) {
          console.log('\nJob stop requested via admin dashboard.');
          shuttingDown = true;
        }
        if (shuttingDown) {
          console.log('\nGraceful shutdown: batch completed.');
          break;
        }
      }
      await completeJob(jobId);
    } catch (error) {
      await completeJob(jobId, error instanceof Error ? error.message : 'Unknown error');
      throw error;
    }
    printSummary(allResults, globalStart);
    return;
  }

  // Default mode: single batch with --limit
  const limit = readIntFlag(args, '--limit', 100);
  console.log('============================================================');
  console.log('Church Website Scraper');
  console.log('============================================================');
  console.log(`Language: ${language || 'all'}`);
  console.log(`Limit: ${limit}`);
  console.log(`Max failures: ${maxFailures}`);
  console.log(`Started: ${new Date().toISOString()}`);
  console.log('============================================================\n');

  // Job tracking for single batch mode too
  const jobId =
    (await createOrResumeJob(args)) ||
    (await createNewJob(language, { limit, maxFailures, language }));
  console.log(`Job ID: ${jobId}\n`);
  const startTime = Date.now();
  try {
    const results = await scrapeAllChurches({ limit, maxFailures, language: language || undefined });
    const succeeded = results.filter((r) => r.success).length;
    const failed = results.length - succeeded;
    const totalSchedules = results.reduce((sum, r) => sum + r.schedulesFound, 0);
    await updateJobProgress(jobId, results.length, succeeded, failed, totalSchedules, limit);
    await completeJob(jobId);
    printSummary(results, startTime);
  } catch (error) {
    await completeJob(jobId, error instanceof Error ? error.message : 'Unknown error');
    throw error;
  }
}
/**
 * Prints the end-of-run report: totals, elapsed time, hourly rate and up to
 * the first 50 failed churches with their error.
 *
 * @param results Every scrape result collected during the run.
 * @param startTime Epoch milliseconds at which the run started.
 */
function printSummary(results: ScrapeJobResult[], startTime: number) {
  const elapsed = (Date.now() - startTime) / 1000;

  // Single pass over the results instead of separate filter/reduce calls.
  const failures: ScrapeJobResult[] = [];
  let successCount = 0;
  let scheduleCount = 0;
  for (const entry of results) {
    if (entry.success) {
      successCount += 1;
    } else {
      failures.push(entry);
    }
    scheduleCount += entry.schedulesFound;
  }
  const perHour = results.length / (elapsed / 3600);

  const divider = '============================================================';
  console.log(`\n${divider}`);
  console.log('Scraping Summary');
  console.log(divider);
  console.log(`Churches processed: ${results.length.toLocaleString()}`);
  console.log(`Succeeded: ${successCount.toLocaleString()}`);
  console.log(`Failed: ${failures.length.toLocaleString()}`);
  console.log(`Total schedules found: ${scheduleCount.toLocaleString()}`);
  console.log(`Elapsed time: ${formatDuration(elapsed)}`);
  console.log(`Average rate: ${perHour.toFixed(0)}/hr`);
  console.log(`Finished: ${new Date().toISOString()}`);
  console.log(divider);

  if (failures.length === 0) return;

  console.log(`\nFailed churches (${failures.length}):`);
  // Show first 50 failures to avoid overwhelming output
  const maxShown = 50;
  failures.slice(0, maxShown).forEach((failure) => {
    console.log(` - ${failure.churchName}: ${failure.error}`);
  });
  if (failures.length > maxShown) {
    console.log(` ... and ${failures.length - maxShown} more`);
  }
}
// Run the script. process.exit() inside the catch handler would terminate the
// process before the cleanup below could run, so the failure is recorded in
// process.exitCode and the forced exit happens only after the connections
// have been closed.
main()
  .catch((error: unknown) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    await jobPrisma.$disconnect();
    await pool.end();
    // process.exit() without an argument uses process.exitCode.
    if (process.exitCode) process.exit();
  });

View File

@@ -0,0 +1,372 @@
#!/usr/bin/env tsx
/**
* Scrape diocese directories to discover parish URLs and mass schedules
*
* Usage:
* npx tsx scripts/scrape-diocese-directory.ts --diocese <id> # Single diocese
* npx tsx scripts/scrape-diocese-directory.ts --country DE # All dioceses in country
* npx tsx scripts/scrape-diocese-directory.ts --all # All active dioceses
* npx tsx scripts/scrape-diocese-directory.ts --all --dry-run # Preview only
* npx tsx scripts/scrape-diocese-directory.ts --job-id <uuid> # Resume tracked job
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { DioceseDirectoryScraper, DioceseScrapeConfig } from '../src/scrapers/diocese-directory-scraper';
// Script-local database access: a pg pool built from DATABASE_URL, wrapped in
// the Prisma pg adapter. main() disconnects the client and ends the pool.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/** Writes `msg` to stdout, prefixed with the current ISO-8601 timestamp in brackets. */
function log(msg: string) {
  const timestamp = new Date().toISOString();
  console.log(`[${timestamp}] ${msg}`);
}
/** Writes `msg` to stderr, prefixed with the current ISO-8601 timestamp and "ERROR:". */
function logError(msg: string) {
  const timestamp = new Date().toISOString();
  console.error(`[${timestamp}] ERROR: ${msg}`);
}
/**
 * Great-circle distance between two points, using the haversine formula with
 * an Earth radius of 6371 km.
 *
 * @param lat1 Latitude of the first point, in degrees.
 * @param lon1 Longitude of the first point, in degrees.
 * @param lat2 Latitude of the second point, in degrees.
 * @param lon2 Longitude of the second point, in degrees.
 * @returns Distance in kilometres.
 */
function haversineKm(lat1: number, lon1: number, lat2: number, lon2: number): number {
  const EARTH_RADIUS_KM = 6371;
  const toRadians = (degrees: number): number => degrees * Math.PI / 180;

  const halfDeltaLat = toRadians(lat2 - lat1) / 2;
  const halfDeltaLon = toRadians(lon2 - lon1) / 2;
  const haversine =
    Math.sin(halfDeltaLat) ** 2 +
    Math.cos(toRadians(lat1)) * Math.cos(toRadians(lat2)) * Math.sin(halfDeltaLon) ** 2;

  return EARTH_RADIUS_KM * 2 * Math.asin(Math.sqrt(haversine));
}
/**
 * Reduces a name to a canonical form for fuzzy comparison: lower-cased,
 * diacritics stripped, every character other than a-z, 0-9 and whitespace
 * removed (not replaced by a space), whitespace runs collapsed, and trimmed.
 *
 * @param str Raw name.
 * @returns The normalised name.
 */
function normalizeForMatch(str: string): string {
  // Decompose accented letters, then drop the combining marks.
  const withoutDiacritics = str
    .toLowerCase()
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '');
  const alphanumericOnly = withoutDiacritics.replace(/[^a-z0-9\s]/g, '');
  const singleSpaced = alphanumericOnly.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
/** A database church selected by findMatchingChurch() as the best name match. */
interface MatchCandidate {
// Id and name of the matched church row.
id: string;
name: string;
// Coordinates copied from the matched church row.
latitude: number;
longitude: number;
// Always set to 0 by findMatchingChurch(); no distance is computed there.
distance: number;
// Fraction (0..1) of the searched name's words that also occur in this church's name.
nameScore: number;
}
/**
 * Looks for the database church whose name best matches a parish from a
 * diocese directory.
 *
 * At most 50 churches of the same country are loaded (narrowed by a
 * case-insensitive city substring when a city is given). Each is scored by
 * the share of the parish name's words (3+ characters, normalised) that also
 * appear in the church name; scores below 0.4 are discarded and the highest
 * score wins, the earliest candidate winning ties.
 *
 * @param name Parish name from the directory.
 * @param address Parish address (currently not used for matching).
 * @param city Parish city, used to narrow the candidate query when present.
 * @param country Country to search in.
 * @returns The best candidate, or null when the name has no usable words or
 *   nothing reaches the threshold.
 */
async function findMatchingChurch(
  name: string,
  address: string | undefined,
  city: string | undefined,
  country: string,
): Promise<MatchCandidate | null> {
  const MIN_WORD_LENGTH = 3;
  const MIN_NAME_SCORE = 0.4;
  const significantWords = (text: string): string[] =>
    normalizeForMatch(text)
      .split(' ')
      .filter((word) => word.length >= MIN_WORD_LENGTH);

  // Search by name similarity + country
  const wantedWords = significantWords(name);
  if (wantedWords.length === 0) return null;

  // Find churches in the same country
  const cityFilter = city ? { city: { contains: city, mode: 'insensitive' as const } } : {};
  const candidates = await prisma.church.findMany({
    where: { country, ...cityFilter },
    select: { id: true, name: true, latitude: true, longitude: true, website: true },
    take: 50,
  });

  let best: MatchCandidate | null = null;
  for (const candidate of candidates) {
    const candidateWords = new Set(significantWords(candidate.name));
    const sharedCount = wantedWords.filter((word) => candidateWords.has(word)).length;
    const nameScore = sharedCount / wantedWords.length;

    // Require at least 40% word overlap
    if (nameScore < MIN_NAME_SCORE) continue;
    // Only a strictly better score replaces the current best.
    if (best !== null && nameScore <= best.nameScore) continue;

    best = {
      id: candidate.id,
      name: candidate.name,
      latitude: candidate.latitude,
      longitude: candidate.longitude,
      distance: 0,
      nameScore,
    };
  }
  return best;
}
// --- Job Tracking ---
/**
 * Resumes an existing background job when `--job-id <id>` is present.
 *
 * The job row is set to status 'running' and its `startedAt` is reset to now.
 *
 * @param args Command-line arguments (without node/script path).
 * @returns The job id that was resumed, or null when `--job-id` is absent.
 * @throws Error when `--job-id` is not followed by a value.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const jobIdIndex = args.indexOf('--job-id');
  if (jobIdIndex === -1) return null;

  const jobId = args[jobIdIndex + 1];
  // Reject a missing value (or another flag in its place) up front rather
  // than sending an undefined id to the database.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job id value');
  }
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
/**
 * Scrapes one diocese directory and reconciles the discovered parishes with
 * existing churches.
 *
 * A diocese without a directory URL or without selectors in its scrape config
 * is skipped. Each discovered parish is matched by name; for a match (outside
 * dry-run) the church is linked to the diocese, given the parish website when
 * one was found, and its mass schedules are replaced by the scraped ones.
 * Unmatched parishes are only logged and counted. Scrape failures are caught,
 * counted and recorded on the diocese row instead of being rethrown.
 *
 * @param dioceseId Id of the diocese to scrape.
 * @param dryRun When true nothing is written to the database.
 * @param stats Running counters, mutated in place. `created` counts unmatched
 *   parishes; no church is actually created.
 */
async function scrapeDiocese(
  dioceseId: string,
  dryRun: boolean,
  stats: { processed: number; matched: number; created: number; schedules: number; errors: number }
): Promise<void> {
  const diocese = await prisma.diocese.findUnique({ where: { id: dioceseId } });
  if (!diocese) {
    logError(`Diocese not found: ${dioceseId}`);
    return;
  }
  if (!diocese.directoryUrl) {
    log(` Skipping ${diocese.name}: no directory URL`);
    return;
  }
  const config = diocese.scrapeConfig as DioceseScrapeConfig | null;
  if (!config?.selectors) {
    log(` Skipping ${diocese.name}: no scrape config`);
    return;
  }
  log(`Scraping diocese: ${diocese.name} (${diocese.country})`);
  log(` Directory URL: ${diocese.directoryUrl}`);

  const scraper = new DioceseDirectoryScraper();
  try {
    let parishes;
    if (config.scheduleInDirectory) {
      parishes = await scraper.scrapeDirectoryWithSchedules(
        diocese.directoryUrl,
        config,
        diocese.language
      );
    } else {
      // Plain directory: give every parish an empty schedule list so both
      // branches produce the same shape.
      const discovered = await scraper.scrapeDirectory(diocese.directoryUrl, config);
      parishes = discovered.map(p => ({
        ...p,
        scheduleText: '',
        schedules: [] as Array<{ dayOfWeek: number; time: string; massType?: string; language?: string; notes?: string }>,
      }));
    }
    log(` Discovered ${parishes.length} parishes`);

    for (const parish of parishes) {
      stats.processed++;
      // Try to match to existing church
      const match = await findMatchingChurch(
        parish.name,
        parish.address,
        parish.city,
        diocese.country,
      );
      if (!match) {
        log(` No match: "${parish.name}" (${parish.city || 'no city'})`);
        stats.created++;
        // In non-dry-run, we could create new churches, but for safety
        // we only log unmatched parishes for manual review
        // (Creating churches from directory data without coordinates is risky)
        continue;
      }

      stats.matched++;
      log(` Match: "${parish.name}" -> "${match.name}" (score: ${match.nameScore.toFixed(2)})`);
      if (dryRun) continue;

      // Update matched church with diocese link, plus the website when the
      // directory actually provided one (never flag hasWebsite without a URL).
      await prisma.church.update({
        where: { id: match.id },
        data: {
          ...(parish.url ? { website: parish.url, hasWebsite: true } : {}),
          dioceseId: diocese.id,
        },
      });

      // Save schedules if available
      if ('schedules' in parish && parish.schedules.length > 0) {
        // Replace atomically: if the insert fails, the delete is rolled back
        // and the church keeps its previous schedules.
        await prisma.$transaction([
          prisma.massSchedule.deleteMany({ where: { churchId: match.id } }),
          prisma.massSchedule.createMany({
            data: parish.schedules.map(s => ({
              churchId: match.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              massType: s.massType,
              language: s.language ?? 'English',
              notes: s.notes,
            })),
          }),
        ]);
        stats.schedules += parish.schedules.length;
      }
    }

    // Update diocese tracking
    if (!dryRun) {
      await prisma.diocese.update({
        where: { id: diocese.id },
        data: {
          lastScrapedAt: new Date(),
          lastSuccessAt: new Date(),
          churchCount: parishes.length,
          failureCount: 0,
        },
      });
    }
  } catch (err: unknown) {
    stats.errors++;
    // Anything can be thrown, so narrow before reading `.message`.
    const message = err instanceof Error ? err.message : String(err);
    logError(` Failed to scrape ${diocese.name}: ${message}`);
    if (!dryRun) {
      await prisma.diocese.update({
        where: { id: diocese.id },
        data: {
          lastScrapedAt: new Date(),
          lastFailureAt: new Date(),
          failureCount: { increment: 1 },
        },
      });
    }
  } finally {
    await scraper.close();
  }
}
/**
 * CLI entry point for the diocese directory scraper.
 *
 * Scrapes the diocese given by `--diocese <id>`, or every active diocese with
 * a directory URL (restricted to `--country <code>` when given). `--dry-run`
 * previews without writing. Outside dry-run the run is tracked in a background
 * job, which is resumed when `--job-id` is given; the loop stops early when
 * that job's status becomes 'stopping'.
 *
 * The database connections are released on every exit path.
 *
 * @throws Rethrows fatal errors after marking the tracked job as failed.
 */
async function main() {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');
  const dioceseIdx = args.indexOf('--diocese');
  const countryIdx = args.indexOf('--country');
  const all = args.includes('--all');
  const dioceseId = dioceseIdx !== -1 ? args[dioceseIdx + 1] : undefined;
  const country = countryIdx !== -1 ? args[countryIdx + 1] : undefined;
  log('============================================================');
  log('Diocese Directory Scraper');
  log('============================================================');
  log(`Mode: ${dryRun ? 'Dry run' : 'Execute'}`);
  log(`Target: ${dioceseId ? `Diocese ${dioceseId}` : country ? `Country ${country}` : 'All active'}`);
  log('============================================================');

  try {
    // Job tracking
    let jobId = await createOrResumeJob(args);
    if (!jobId && !dryRun) {
      const job = await prisma.backgroundJob.create({
        data: {
          type: 'diocese-directory',
          status: 'running',
          startedAt: new Date(),
          config: { dioceseId, country, all, dryRun },
        },
      });
      jobId = job.id;
      log(`Job ID: ${jobId}`);
    }

    const stats = { processed: 0, matched: 0, created: 0, schedules: 0, errors: 0 };
    try {
      let dioceses;
      if (dioceseId) {
        dioceses = [{ id: dioceseId }];
      } else {
        dioceses = await prisma.diocese.findMany({
          where: {
            active: true,
            directoryUrl: { not: null },
            ...(country ? { country } : {}),
          },
          select: { id: true, name: true },
          orderBy: { name: 'asc' },
        });
      }
      log(`Found ${dioceses.length} dioceses to scrape`);
      for (const d of dioceses) {
        await scrapeDiocese(d.id, dryRun, stats);
        // Publish progress and check for job stop
        if (jobId) {
          await prisma.backgroundJob.update({
            where: { id: jobId },
            data: { processed: stats.processed, succeeded: stats.matched, itemsFound: stats.matched },
          });
          const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
          if (job?.status === 'stopping') {
            log('Job stop requested.');
            break;
          }
        }
      }
    } catch (error: unknown) {
      // Anything can be thrown, so narrow before reading `.message`.
      const message = error instanceof Error ? error.message : String(error);
      logError(`Fatal error: ${message}`);
      if (jobId) {
        await prisma.backgroundJob.update({
          where: { id: jobId },
          data: { status: 'failed', error: message, completedAt: new Date() },
        });
      }
      throw error;
    }

    // Complete job
    if (jobId) {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: {
          status: 'completed',
          completedAt: new Date(),
          processed: stats.processed,
          succeeded: stats.matched,
          itemsFound: stats.matched,
        },
      });
    }
    log('');
    log('============================================================');
    log('Diocese Directory Scraper Summary');
    log('============================================================');
    log(`Parishes discovered: ${stats.processed}`);
    log(`Matched to DB: ${stats.matched}`);
    log(`Unmatched (new): ${stats.created}`);
    log(`Schedules saved: ${stats.schedules}`);
    log(`Errors: ${stats.errors}`);
    log('============================================================');
  } finally {
    // Runs on success and on failure, before the caller decides to exit.
    await prisma.$disconnect();
    await pool.end();
  }
}
main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  logError(`Fatal error: ${message}`);
  process.exit(1);
});

171
scripts/scrape-masstimes.ts Normal file
View File

@@ -0,0 +1,171 @@
import 'dotenv/config';
import { prisma } from '../src/lib/db';
import { MassTimesScraper, ChurchData } from '../src/lib/masstimes-scraper';
// Two-letter codes passed one at a time to scraper.scrapeState() by main():
// 51 entries, including 'DC'.
const TARGET_STATES = [
'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL',
'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME',
'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH',
'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI',
'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI',
'WY',
];
/**
 * Drops mass schedules that repeat an earlier entry's day, time and language.
 *
 * The first occurrence of each (dayOfWeek, time, language) combination is
 * kept, and the relative order of the kept entries is unchanged.
 *
 * @param schedules Schedules to filter; the array itself is not modified.
 * @returns A new array without the repeated entries.
 */
function deduplicateMassSchedules<T extends { dayOfWeek: number; time: string; language: string }>(schedules: T[]): T[] {
  const seenKeys = new Set<string>();
  return schedules.filter((schedule) => {
    const key = `${schedule.dayOfWeek}:${schedule.time}:${schedule.language}`;
    if (seenKeys.has(key)) return false;
    seenKeys.add(key);
    return true;
  });
}
/**
 * Upserts one scraped church (keyed by its masstimes id) and replaces its
 * mass, confession and adoration schedules, all inside one transaction.
 *
 * @param data Scraped church record including its schedules.
 * @param seenIds Masstimes ids already saved in this run; the id is added
 *   after a successful save.
 * @returns True when the church was saved; false when its id was already in
 *   `seenIds` or when saving failed (the error is logged, not rethrown).
 */
async function saveChurch(data: ChurchData, seenIds: Set<string>): Promise<boolean> {
  if (seenIds.has(data.masstimesId)) {
    console.log(` Skipping duplicate: ${data.name}`);
    return false;
  }

  // Fields written both when creating and when updating a church.
  const sharedFields = {
    name: data.name,
    address: data.address,
    city: data.city,
    state: data.state,
    zip: data.zip,
    latitude: data.latitude,
    longitude: data.longitude,
    phone: data.phone,
    website: data.website,
    email: data.email,
    pastorName: data.pastorName,
    diocese: data.diocese,
    directions: data.directions,
    wheelchairAccess: data.wheelchairAccess,
    lastScrapedAt: new Date(),
  };

  try {
    await prisma.$transaction(async (tx) => {
      // The id, country and scrape strategy are only set on creation.
      const { id: churchId } = await tx.church.upsert({
        where: { masstimesId: data.masstimesId },
        create: {
          masstimesId: data.masstimesId,
          ...sharedFields,
          country: data.country,
          scrapeStrategy: 'masstimes',
        },
        update: sharedFields,
      });

      // Clear every existing schedule before inserting the scraped ones.
      await tx.massSchedule.deleteMany({ where: { churchId } });
      await tx.confessionSchedule.deleteMany({ where: { churchId } });
      await tx.adorationSchedule.deleteMany({ where: { churchId } });

      if (data.massSchedules.length > 0) {
        const massRows = deduplicateMassSchedules(data.massSchedules).map((mass) => ({
          churchId,
          dayOfWeek: mass.dayOfWeek,
          time: mass.time,
          massType: mass.massType,
          language: mass.language,
          notes: mass.notes,
        }));
        await tx.massSchedule.createMany({ data: massRows });
      }

      if (data.confessionSchedules.length > 0) {
        const confessionRows = data.confessionSchedules.map((confession) => ({
          churchId,
          dayOfWeek: confession.dayOfWeek,
          startTime: confession.startTime,
          endTime: confession.endTime,
          notes: confession.notes,
        }));
        await tx.confessionSchedule.createMany({ data: confessionRows });
      }

      if (data.adorationSchedules.length > 0) {
        const adorationRows = data.adorationSchedules.map((adoration) => ({
          churchId,
          dayOfWeek: adoration.dayOfWeek,
          startTime: adoration.startTime,
          endTime: adoration.endTime,
          isPerpetual: adoration.isPerpetual,
          notes: adoration.notes,
        }));
        await tx.adorationSchedule.createMany({ data: adorationRows });
      }
    });
  } catch (error) {
    console.error(` Error saving ${data.name}:`, error);
    return false;
  }

  seenIds.add(data.masstimesId);
  console.log(` Saved: ${data.name}`);
  return true;
}
/**
 * Scrapes every target state from masstimes.org and saves the churches.
 *
 * States are processed sequentially with a 5-minute pause between them (but
 * not after the last one). Churches already saved earlier in the run are
 * counted as duplicates rather than errors. The scraper and the Prisma client
 * are closed even when a state fails.
 */
async function main() {
  const REST_BETWEEN_STATES_MS = 300000;
  const seenIds = new Set<string>();
  console.log('\n' + '='.repeat(70));
  console.log('MASSTIMES.ORG CHURCH SCRAPER (JSON API)');
  console.log('='.repeat(70));
  console.log(`\nTarget states: ${TARGET_STATES.length}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('\n' + '-'.repeat(70));

  const scraper = new MassTimesScraper();
  const stats = { total: 0, saved: 0, duplicates: 0, errors: 0 };
  try {
    await scraper.init();
    console.log('Browser initialized\n');
    for (const [i, state] of TARGET_STATES.entries()) {
      console.log(`\n[${'='.repeat(20)}] SCRAPING ${state} [${'='.repeat(20)}]\n`);
      console.log(`State ${i + 1}/${TARGET_STATES.length}: ${state}`);
      const churches = await scraper.scrapeState(state);
      stats.total += churches.length;
      console.log(`\n Saving ${churches.length} churches from ${state} to database...`);
      for (const church of churches) {
        // saveChurch() returns false both for duplicates and for failures;
        // checking the id beforehand tells the two apart.
        const alreadySaved = seenIds.has(church.masstimesId);
        const saved = await saveChurch(church, seenIds);
        if (saved) stats.saved++;
        else if (alreadySaved) stats.duplicates++;
        else stats.errors++;
      }
      // No point in waiting once the last state has been processed.
      if (i < TARGET_STATES.length - 1) {
        console.log(`\n Resting 5 minutes before next state...\n`);
        await new Promise(resolve => setTimeout(resolve, REST_BETWEEN_STATES_MS));
      }
    }
  } finally {
    await scraper.close();
    await prisma.$disconnect();
  }

  console.log('\n' + '='.repeat(70));
  console.log('SUMMARY');
  console.log('='.repeat(70));
  console.log(`Total scraped: ${stats.total}`);
  console.log(`Saved: ${stats.saved}`);
  console.log(`Duplicates: ${stats.duplicates}`);
  console.log(`Errors: ${stats.errors}`);
  console.log('='.repeat(70) + '\n');
}
main().catch((error: unknown) => {
  console.error(error);
  // Report the failure to the caller instead of exiting with status 0.
  process.exitCode = 1;
});

328
scripts/setup-diocese.ts Executable file
View File

@@ -0,0 +1,328 @@
#!/usr/bin/env tsx
/**
* Interactive helper to configure a new diocese for scraping
*
* Usage:
* npx tsx scripts/setup-diocese.ts --url https://bistum-mainz.de/pfarreien --country DE --language de
* npx tsx scripts/setup-diocese.ts --url https://diocese-paris.fr/paroisses --country FR --language fr
* npx tsx scripts/setup-diocese.ts --list # List all configured dioceses
* npx tsx scripts/setup-diocese.ts --test <diocese-id> # Test scraping a diocese
*/
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { DioceseDirectoryScraper, DioceseScrapeConfig } from '../src/scrapers/diocese-directory-scraper';
import readline from 'readline';
// Database wiring for this script: a pg connection pool built from
// DATABASE_URL is wrapped in the Prisma pg adapter, and that adapter backs the
// single PrismaClient used by every function below.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/** Writes `msg` to stdout, prefixed with the current ISO-8601 timestamp in square brackets. */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
/** Writes `msg` to stderr, prefixed with the current ISO-8601 timestamp and an `ERROR:` tag. */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error('[' + stamp + '] ERROR: ' + msg);
}
/**
 * Prints `question` on stdout and resolves with the reply read from stdin,
 * trimmed of surrounding whitespace. A fresh readline interface is opened for
 * each question and closed as soon as the answer arrives.
 */
function ask(question: string): Promise<string> {
  return new Promise<string>((resolve) => {
    const prompt = readline.createInterface({ input: process.stdin, output: process.stdout });
    prompt.question(question, (reply) => {
      prompt.close();
      resolve(reply.trim());
    });
  });
}
/**
 * Prints all diocese rows as a fixed-width table (ordered by country, then
 * name) followed by a total count. Logs a notice instead when there are none.
 */
async function listDioceses() {
  const rows = await prisma.diocese.findMany({
    orderBy: [{ country: 'asc' }, { name: 'asc' }],
  });
  if (rows.length === 0) {
    log('No dioceses configured yet.');
    return;
  }

  const rule = '─'.repeat(100);
  const header = [
    'ID'.padEnd(38),
    'Name'.padEnd(30),
    'Country'.padEnd(10),
    'Active'.padEnd(8),
    'Churches'.padEnd(10),
    'Last Scraped',
  ].join('');

  console.log('\nConfigured Dioceses:');
  console.log(rule);
  console.log(header);
  console.log(rule);

  rows.forEach((row) => {
    const activeLabel = row.active ? 'Yes' : 'No';
    // Date portion only (YYYY-MM-DD) of the last scrape, when there was one.
    const lastScraped = row.lastScrapedAt
      ? row.lastScrapedAt.toISOString().split('T')[0]
      : 'Never';
    // Names are cut to 28 characters so they fit the 30-wide column.
    const cells = [
      row.id.padEnd(38),
      row.name.substring(0, 28).padEnd(30),
      row.country.padEnd(10),
      activeLabel.padEnd(8),
      String(row.churchCount).padEnd(10),
      lastScraped,
    ];
    console.log(cells.join(''));
  });

  console.log(rule);
  console.log(`Total: ${rows.length} dioceses`);
}
/**
 * Dry-runs the stored scrape configuration of one diocese: looks the diocese
 * up by ID, scrapes its directory URL and prints the first ten parishes that
 * were discovered. This function itself writes nothing to the database.
 *
 * @param dioceseId Primary key of the diocese row to test.
 */
async function testDiocese(dioceseId: string) {
  const record = await prisma.diocese.findUnique({ where: { id: dioceseId } });
  if (!record) {
    logError(`Diocese not found: ${dioceseId}`);
    return;
  }

  const { name, directoryUrl } = record;
  if (!directoryUrl) {
    logError(`Diocese ${name} has no directory URL`);
    return;
  }

  const scrapeConfig = record.scrapeConfig as DioceseScrapeConfig | null;
  if (!scrapeConfig?.selectors) {
    logError(`Diocese ${name} has no scrape config`);
    return;
  }

  log(`Testing diocese: ${name}`);
  log(`Directory URL: ${directoryUrl}`);
  log('');

  const previewLimit = 10;
  const directoryScraper = new DioceseDirectoryScraper();
  try {
    const found = await directoryScraper.scrapeDirectory(directoryUrl, scrapeConfig);
    log(`\nDiscovered ${found.length} parishes:\n`);

    found.slice(0, previewLimit).forEach((parish) => {
      console.log(` ${parish.name}`);
      console.log(` URL: ${parish.url}`);
      if (parish.address) console.log(` Address: ${parish.address}`);
      if (parish.city) console.log(` City: ${parish.city}`);
      console.log('');
    });

    const notShown = found.length - previewLimit;
    if (notShown > 0) {
      console.log(` ... and ${notShown} more`);
    }
  } finally {
    // Always release the scraper, whether or not the scrape succeeded.
    await directoryScraper.close();
  }
}
/**
 * Interactive wizard that registers a new diocese for scraping.
 *
 * Steps: prompt for the diocese name, refuse duplicates (same name and
 * country), load the directory page and print link/element statistics to help
 * choose selectors, prompt for the CSS selectors, preview the first five
 * matches and, after confirmation, insert the diocese row together with its
 * scrape configuration.
 *
 * @param url      Directory page listing the diocese's parishes.
 * @param country  Country value stored on the row and used in the duplicate check.
 * @param language Language value stored on the row.
 */
async function setupDiocese(url: string, country: string, language: string) {
  log(`Setting up diocese from: ${url}`);
  log(`Country: ${country}, Language: ${language}`);
  // Ask for diocese name
  const name = await ask('\nDiocese name (e.g. "Bistum Mainz"): ');
  if (!name) {
    logError('Name is required');
    return;
  }
  // Check if already exists
  const existing = await prisma.diocese.findFirst({
    where: { name, country },
  });
  if (existing) {
    logError(`Diocese "${name}" already exists in ${country} (ID: ${existing.id})`);
    return;
  }
  // Probe the page structure
  log('\nProbing page structure...');
  const scraper = new DioceseDirectoryScraper();
  await scraper.init();
  try {
    // NOTE(review): reaches into the scraper for its `page` member, which is
    // presumably the browser page created by init() — confirm, and consider
    // exposing a public accessor instead of casting to any.
    const page = (scraper as any).page;
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
    // Analyze page - count links and common patterns.
    // The callback runs inside the page, so it must stay self-contained.
    const analysis = await page.evaluate(() => {
      const links = Array.from(document.querySelectorAll('a'));
      const linkPatterns: Record<string, number> = {};
      for (const link of links) {
        const href = link.href;
        if (!href) continue;
        // Extract pattern from URL path
        try {
          const path = new URL(href).pathname;
          const segments = path.split('/').filter(Boolean);
          if (segments.length >= 1) {
            // Group links by their parent directory. A top-level page such as
            // "/about" has no parent and is grouped under "/*"; joining the
            // empty parent would otherwise yield the malformed "//*".
            const parent = segments.slice(0, -1).join('/');
            const pattern = parent ? '/' + parent + '/*' : '/*';
            linkPatterns[pattern] = (linkPatterns[pattern] || 0) + 1;
          }
        } catch { /* href is not a parseable URL - ignore */ }
      }
      // Find most common list-like elements
      const listSelectors = [
        'ul li', 'ol li', 'div.parish', 'div.item', 'article',
        'tr', '.card', '.entry', '.listing', '.result',
      ];
      const selectorCounts: Record<string, number> = {};
      for (const sel of listSelectors) {
        selectorCounts[sel] = document.querySelectorAll(sel).length;
      }
      return {
        title: document.title,
        totalLinks: links.length,
        // Ten most frequent patterns, most common first.
        linkPatterns: Object.entries(linkPatterns)
          .sort(([, a], [, b]) => b - a)
          .slice(0, 10),
        selectorCounts,
        bodyTextLength: document.body?.textContent?.length || 0,
      };
    });
    console.log(`\nPage: ${analysis.title}`);
    console.log(`Total links: ${analysis.totalLinks}`);
    console.log(`\nMost common link patterns:`);
    for (const [pattern, count] of analysis.linkPatterns) {
      console.log(` ${pattern}: ${count} links`);
    }
    console.log(`\nElement counts:`);
    for (const [sel, count] of Object.entries(analysis.selectorCounts)) {
      if (count > 0) console.log(` ${sel}: ${count}`);
    }
    // Ask for selectors
    console.log('\nNow configure CSS selectors for this diocese.\n');
    const parishList = await ask('Parish list container selector (e.g. "ul.parishes li", ".parish-item"): ');
    const parishLink = await ask('Parish link selector within container (e.g. "a", "a.parish-link"): ');
    // Both selectors are passed to querySelector-style calls in the preview
    // below; an empty selector is invalid there, so stop with a clear message
    // instead of failing later with a selector syntax error.
    if (!parishList || !parishLink) {
      logError('Parish list and parish link selectors are required');
      return;
    }
    // Optional answers: an empty reply becomes undefined so the key is omitted.
    const parishName = await ask('Parish name selector (leave empty to use link text): ') || undefined;
    const parishAddress = await ask('Address selector (leave empty if none): ') || undefined;
    const parishCity = await ask('City selector (leave empty if none): ') || undefined;
    const pagination = await ask('Pagination "next" selector (leave empty if none): ') || undefined;
    const urlPatternStr = await ask('URL pattern regex (leave empty for all): ') || undefined;
    const waitForSelector = await ask('Wait for selector (leave empty if not needed): ') || undefined;
    const scrapeConfig: DioceseScrapeConfig = {
      selectors: {
        parishList,
        parishLink,
        parishName,
        parishAddress,
        parishCity,
        pagination,
      },
      urlPattern: urlPatternStr,
      waitForSelector,
      // Fixed defaults applied by this wizard.
      maxPages: 50,
      scheduleInDirectory: false,
    };
    // Test the config against the page that is already loaded.
    console.log('\nTesting selectors...');
    const testResults = await page.$$eval(
      parishList,
      (elements: Element[], linkSel: string) => {
        return elements.slice(0, 5).map(el => {
          const link = el.querySelector(linkSel);
          return {
            name: link?.textContent?.trim() || el.textContent?.trim()?.substring(0, 80) || '(empty)',
            url: link?.getAttribute('href') || '(no link)',
          };
        });
      },
      parishLink
    );
    console.log(`\nTest extraction (first 5):`);
    for (const r of testResults) {
      console.log(` ${r.name}`);
      console.log(` -> ${r.url}`);
    }
    if (testResults.length === 0) {
      // Make a selector that matches nothing obvious before asking to save.
      console.log(' (no elements matched the parish list selector)');
    }
    const confirm = await ask('\nSave this configuration? (yes/no): ');
    if (confirm.toLowerCase() !== 'yes' && confirm.toLowerCase() !== 'y') {
      log('Cancelled.');
      return;
    }
    // Save to database
    const diocese = await prisma.diocese.create({
      data: {
        name,
        country,
        language,
        website: new URL(url).origin,
        directoryUrl: url,
        // Cast needed to store the typed config in the scrapeConfig column.
        scrapeConfig: scrapeConfig as any,
        active: true,
      },
    });
    log(`\nDiocese saved! ID: ${diocese.id}`);
    log(`Run: npx tsx scripts/scrape-diocese-directory.ts --diocese ${diocese.id} --dry-run`);
  } finally {
    // Runs on every exit path above, including the early returns.
    await scraper.close();
  }
}
/** Prints the command-line usage and examples for this script. */
function printUsage() {
  console.log('Usage:');
  console.log(' npx tsx scripts/setup-diocese.ts --url <directory-url> --country <CC> --language <lang>');
  console.log(' npx tsx scripts/setup-diocese.ts --list');
  console.log(' npx tsx scripts/setup-diocese.ts --test <diocese-id>');
  console.log('');
  console.log('Examples:');
  console.log(' npx tsx scripts/setup-diocese.ts --url https://bistum-mainz.de/pfarreien --country DE --language de');
  console.log(' npx tsx scripts/setup-diocese.ts --url https://diocese-paris.fr/paroisses --country FR --language fr');
}

/**
 * Returns the argument that follows `flag`, or undefined when the flag is
 * absent, is the last argument, or is directly followed by another `--flag`.
 */
function getFlagValue(args: readonly string[], flag: string): string | undefined {
  const flagIdx = args.indexOf(flag);
  if (flagIdx === -1) return undefined;
  const value = args[flagIdx + 1];
  return value !== undefined && !value.startsWith('--') ? value : undefined;
}

/**
 * CLI dispatcher: `--list` prints the configured dioceses, `--test <id>`
 * dry-runs one diocese, and `--url ... --country ...` starts the interactive
 * setup. Anything else prints the usage text. The database connections are
 * closed on every path, including when a command throws.
 */
async function main() {
  const args = process.argv.slice(2);
  try {
    if (args.includes('--list')) {
      await listDioceses();
      return;
    }
    if (args.includes('--test')) {
      const dioceseId = getFlagValue(args, '--test');
      if (!dioceseId) {
        // Without this check an undefined ID would be passed to the lookup.
        logError('--test requires a diocese ID');
        printUsage();
        return;
      }
      await testDiocese(dioceseId);
      return;
    }
    const url = getFlagValue(args, '--url');
    const country = getFlagValue(args, '--country');
    if (!url || !country) {
      printUsage();
      return;
    }
    // The language defaults to the lower-cased country code (e.g. DE -> de).
    const language = getFlagValue(args, '--language') ?? country.toLowerCase();
    await setupDiocese(url, country, language);
  } finally {
    // End the pool even when disconnecting Prisma throws.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
main().catch((error: unknown) => {
  // Anything can be thrown, so do not assume an Error instance.
  const message = error instanceof Error ? error.message : String(error);
  logError(`Fatal error: ${message}`);
  process.exit(1);
});

397
scripts/test-edge-cases.ts Normal file
View File

@@ -0,0 +1,397 @@
#!/usr/bin/env tsx
/**
* Comprehensive edge case test suite for the international mass scraper
*
* This test suite validates all edge cases discovered and fixed during development:
* 1. Day range expansion (Monday-Friday, wtorek-sobota, etc.)
* 2. Office hours filtering (öffnungszeiten, horario, kancelaria, etc.)
* 3. Short abbreviation word boundaries (pn, cz, n in Polish)
* 4. Invalid time filtering (00:00-04:59)
* 5. Deduplication (same schedule appearing multiple times)
* 6. Context-based scoring (mass schedule vs office hours)
* 7. "Closed" notice filtering (nieczynna, fermé, cerrado, etc.)
*/
import { GenericScraper } from '../src/scrapers/strategies/generic';
/** Checks applied to the schedules scraped from one site; every check is optional. */
interface EdgeCaseExpectations {
  /** Fewest schedules the scrape may return. */
  minSchedules?: number;
  /** Most schedules the scrape may return. */
  maxSchedules?: number;
  /** Days that must appear at least once: 0=Sun, 1=Mon, etc. */
  shouldHaveDays?: number[];
  /** Invalid times that should be filtered */
  shouldNotHaveTimes?: string[];
  /** Valid times that should be found */
  shouldHaveTimes?: string[];
}

/** One live-site scenario of the edge-case suite. */
interface EdgeCaseTest {
  name: string;
  url: string;
  country: string;
  language: string;
  /** Human-readable descriptions of the edge cases this site exercises. */
  edgeCases: string[];
  expectations: EdgeCaseExpectations;
  /** Caveats printed alongside the result. */
  knownIssues?: string[];
}
/**
 * Live-site scenarios run by this suite, one entry per church website.
 * `edgeCases` is descriptive text only; the checks that are actually enforced
 * come from `expectations` (schedule-count bounds, required days, and times
 * that must or must not appear).
 */
const edgeCaseTests: EdgeCaseTest[] = [
// POLISH - Day ranges, office hours, short abbreviations
{
name: 'Parafia Lubojna (PL)',
url: 'http://parafialubojna.pl',
country: 'PL',
language: 'Polish',
edgeCases: [
'Day range: "wtorek - sobota" (Tuesday-Saturday)',
'Office hours: "kancelaria czynna" with times',
'Short abbreviations: "pn", "cz", "n" in words like "sierpniu", "uroczystości"',
'"Closed" notice: "nieczynna: niedziela, poniedziałek"',
'Space-separated times: "8 00", "9 30", "18 00"',
],
expectations: {
minSchedules: 10,
maxSchedules: 10,
shouldHaveDays: [0, 1, 2, 3, 4, 5, 6], // All 7 days
shouldHaveTimes: ['08:00', '09:30', '11:00', '16:00', '18:00'],
shouldNotHaveTimes: ['18:30', '19:00', '09:00'], // Office hours times
},
},
// GERMAN - Office hours, Uhr format, duplicates
{
name: 'St. Peter, Munich (DE)',
url: 'https://www.alterpeter.de/',
country: 'DE',
language: 'German',
edgeCases: [
'Office hours: "öffnungszeiten im pfarrbüro: montag bis donnerstag 9.00 12.00"',
'Day range: "montag bis donnerstag" (Monday to Thursday)',
'Uhr time format: "10:00 uhr", "17.15 Uhr"',
'Invalid time: "00 uhr" from fragmented "10:00 uhr"',
'Duplicates: Same schedule in current week + general schedule',
'Multi-church parish: Different churches with different times',
],
expectations: {
minSchedules: 10,
maxSchedules: 20,
shouldHaveDays: [0, 6], // At minimum Sunday and Saturday
shouldNotHaveTimes: ['09:00', '12:00', '14:00', '16:00', '00:00'], // Office hours + invalid
},
},
// ITALIAN - Period separator
{
name: 'Duomo di Milano (IT)',
url: 'https://www.duomomilano.it/',
country: 'IT',
language: 'Italian',
edgeCases: [
'Period separator: "18.30", "9.00"',
'Day ranges: "da lunedì a venerdì"',
'Office hours: "orari" or "ufficio"',
],
expectations: {
minSchedules: 10,
maxSchedules: 25,
shouldHaveDays: [0, 1, 2, 3, 4, 5, 6], // All days likely
},
},
// SPANISH - Day ranges with "a"
{
name: 'Sagrada Família, Barcelona (ES)',
url: 'https://sagradafamilia.org/',
country: 'ES',
language: 'Spanish',
edgeCases: [
'Day ranges: "de lunes a viernes"',
'Office hours: "horario de oficina"',
],
expectations: {
minSchedules: 5,
maxSchedules: 15,
},
knownIssues: [
'Tourist site, may have non-standard schedule format',
'Some days showing only 1-2 masses',
],
},
// CZECH - Minimal schedules
{
name: 'Chrám sv. Víta, Prague (CZ)',
url: 'https://www.katedralasvatehovita.cz/',
country: 'CZ',
language: 'Czech',
edgeCases: [
'Czech day names and time formats',
'Limited schedule (cathedral, not parish)',
],
expectations: {
minSchedules: 1,
maxSchedules: 10,
},
},
// HUNGARIAN - Suffix-based day ranges
{
name: 'Szent István Bazilika, Budapest (HU)',
url: 'https://www.bazilika.biz/',
country: 'HU',
language: 'Hungarian',
edgeCases: [
'Hungarian day names',
'Day range suffixes: "-tól", "-től"',
'Limited weekday schedule',
],
expectations: {
minSchedules: 3,
maxSchedules: 10,
shouldHaveDays: [1, 2, 3, 4, 5], // Weekdays
},
},
];
/** Outcome of running one EdgeCaseTest against its live site. */
interface TestResult {
// Copied from the test definition.
name: string;
// Starts true and is flipped to false by the first failed check.
passed: boolean;
// Number of schedules the scrape returned (stays 0 if the scrape failed).
scheduleCount: number;
// One human-readable message per failed check or caught exception.
issues: string[];
// One message per check that succeeded.
edgeCasesValidated: string[];
}
/**
 * Scrapes one edge-case site and checks the result against its expectations.
 *
 * Checks, in order: schedule-count bounds, required days, times that must be
 * absent, times that must be present, no duplicate day+time pairs, and no
 * times between 00:00 and 04:59. Every failed check appends a message to
 * `issues` and clears `passed`; every satisfied check appends to
 * `edgeCasesValidated`.
 *
 * @param test    Scenario to run.
 * @param scraper Initialised scraper; its country is set from the scenario.
 * @returns The result; never rejects, because exceptions are recorded as issues.
 */
async function runEdgeCaseTest(test: EdgeCaseTest, scraper: GenericScraper): Promise<TestResult> {
  const result: TestResult = {
    name: test.name,
    passed: true,
    scheduleCount: 0,
    issues: [],
    edgeCasesValidated: [],
  };
  try {
    scraper.setCountry(test.country);
    const scrapeResult = await scraper.scrape(test.url);
    if (!scrapeResult.success) {
      result.passed = false;
      result.issues.push(`Scrape failed: ${scrapeResult.error}`);
      return result;
    }
    const { expectations } = test;
    result.scheduleCount = scrapeResult.schedules.length;
    // Validate schedule count. The bounds are compared against undefined
    // rather than tested for truthiness, so an explicit bound of 0 (for
    // example "this page must yield no schedules") is still enforced.
    if (expectations.minSchedules !== undefined && result.scheduleCount < expectations.minSchedules) {
      result.passed = false;
      result.issues.push(
        `Too few schedules: ${result.scheduleCount} < ${expectations.minSchedules}`
      );
    }
    if (expectations.maxSchedules !== undefined && result.scheduleCount > expectations.maxSchedules) {
      result.passed = false;
      result.issues.push(
        `Too many schedules: ${result.scheduleCount} > ${expectations.maxSchedules}`
      );
    }
    // Validate days covered
    if (expectations.shouldHaveDays) {
      const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
      const foundDays = new Set(scrapeResult.schedules.map(s => s.dayOfWeek));
      for (const day of expectations.shouldHaveDays) {
        if (!foundDays.has(day)) {
          result.passed = false;
          result.issues.push(`Missing expected day: ${dayNames[day]}`);
        } else {
          result.edgeCasesValidated.push(`✓ Found ${dayNames[day]}`);
        }
      }
    }
    // Built once and shared by both time checks below.
    const foundTimes = new Set(scrapeResult.schedules.map(s => s.time));
    // Validate invalid times are NOT present
    if (expectations.shouldNotHaveTimes) {
      for (const time of expectations.shouldNotHaveTimes) {
        if (foundTimes.has(time)) {
          result.passed = false;
          result.issues.push(`Found invalid time that should be filtered: ${time}`);
        } else {
          result.edgeCasesValidated.push(`✓ Filtered out ${time}`);
        }
      }
    }
    // Validate expected times ARE present
    if (expectations.shouldHaveTimes) {
      for (const time of expectations.shouldHaveTimes) {
        if (!foundTimes.has(time)) {
          result.passed = false;
          result.issues.push(`Missing expected time: ${time}`);
        } else {
          result.edgeCasesValidated.push(`✓ Found ${time}`);
        }
      }
    }
    // Check for duplicates (should be none after deduplication).
    // Two schedules count as duplicates when they share day and time.
    const uniqueKeys = new Set<string>();
    const duplicates: string[] = [];
    for (const schedule of scrapeResult.schedules) {
      const key = `${schedule.dayOfWeek}-${schedule.time}`;
      if (uniqueKeys.has(key)) {
        duplicates.push(key);
      } else {
        uniqueKeys.add(key);
      }
    }
    if (duplicates.length > 0) {
      result.passed = false;
      result.issues.push(`Found ${duplicates.length} duplicate schedules: ${duplicates.join(', ')}`);
    } else {
      result.edgeCasesValidated.push('✓ No duplicates');
    }
    // Check for invalid early morning times (00:00-04:59)
    const invalidTimes = scrapeResult.schedules.filter(s => {
      const [hours] = s.time.split(':').map(Number);
      return hours >= 0 && hours <= 4;
    });
    if (invalidTimes.length > 0) {
      result.passed = false;
      result.issues.push(
        `Found ${invalidTimes.length} invalid early morning times: ${invalidTimes.map(t => t.time).join(', ')}`
      );
    } else {
      result.edgeCasesValidated.push('✓ No invalid times (00:00-04:59)');
    }
  } catch (error) {
    // A thrown scrape is reported as a failed test, not as a crashed suite.
    result.passed = false;
    result.issues.push(`Exception: ${error instanceof Error ? error.message : String(error)}`);
  }
  return result;
}
/**
 * Runs every scenario in edgeCaseTests against its live site, prints a
 * per-test report, a summary table and a static coverage checklist, then exits
 * with status 1 if any scenario failed and 0 otherwise.
 */
async function main() {
  console.log('🧪 EDGE CASE TEST SUITE FOR INTERNATIONAL MASS SCRAPER');
  console.log('='.repeat(80));
  console.log('');
  const scraper = new GenericScraper();
  const results: TestResult[] = [];
  let passCount = 0;
  let failCount = 0;
  try {
    await scraper.init();
    for (const [index, test] of edgeCaseTests.entries()) {
      console.log(`\n📍 Testing: ${test.name} (${test.language})`);
      console.log(` URL: ${test.url}`);
      console.log(` Edge cases to validate:`);
      for (const edgeCase of test.edgeCases) {
        console.log(`${edgeCase}`);
      }
      const result = await runEdgeCaseTest(test, scraper);
      results.push(result);
      if (result.passed) {
        passCount++;
        console.log(`\n ✅ PASSED (${result.scheduleCount} schedules)`);
      } else {
        failCount++;
        console.log(`\n ❌ FAILED (${result.scheduleCount} schedules)`);
      }
      if (result.edgeCasesValidated.length > 0) {
        console.log(`\n Edge cases validated:`);
        for (const validation of result.edgeCasesValidated) {
          console.log(` ${validation}`);
        }
      }
      if (result.issues.length > 0) {
        console.log(`\n ⚠️ Issues:`);
        for (const issue of result.issues) {
          console.log(`${issue}`);
        }
      }
      if (test.knownIssues && test.knownIssues.length > 0) {
        console.log(`\n Known issues:`);
        for (const issue of test.knownIssues) {
          console.log(`${issue}`);
        }
      }
      // Brief delay between tests; there is nothing to wait for after the last one.
      if (index < edgeCaseTests.length - 1) {
        await new Promise(resolve => setTimeout(resolve, 2000));
      }
    }
  } finally {
    // Close the browser even if initialisation or a test run throws.
    await scraper.close();
  }
  // Summary
  console.log('\n\n' + '='.repeat(80));
  console.log('📊 TEST SUMMARY');
  console.log('='.repeat(80));
  console.log(`Total tests: ${results.length}`);
  console.log(`✅ Passed: ${passCount}`);
  console.log(`❌ Failed: ${failCount}`);
  console.log(`Success rate: ${((passCount / results.length) * 100).toFixed(1)}%`);
  // Detailed results table
  console.log('\n' + '-'.repeat(80));
  console.log('Test | Status | Schedules | Issues');
  console.log('-'.repeat(80));
  for (const result of results) {
    const status = result.passed ? '✅ PASS' : '❌ FAIL';
    const name = result.name.padEnd(33);
    const schedules = result.scheduleCount.toString().padStart(9);
    const issues = result.issues.length.toString();
    console.log(`${name} | ${status} | ${schedules} | ${issues}`);
  }
  console.log('-'.repeat(80));
  // Edge case coverage summary.
  // NOTE: this checklist is static text describing what the suite targets; it
  // is printed unchanged regardless of which tests passed above.
  console.log('\n📋 EDGE CASE COVERAGE:');
  console.log('');
  console.log('1. Day Range Expansion:');
  console.log(' ✓ Polish: "wtorek - sobota"');
  console.log(' ✓ German: "montag bis donnerstag"');
  console.log(' ✓ Italian: "da lunedì a venerdì"');
  console.log(' ✓ Spanish: "de lunes a viernes"');
  console.log('');
  console.log('2. Office Hours Filtering:');
  console.log(' ✓ German: "öffnungszeiten im pfarrbüro"');
  console.log(' ✓ Polish: "kancelaria czynna"');
  console.log(' ✓ Spanish: "horario de oficina"');
  console.log(' ✓ Italian: "orari" / "ufficio"');
  console.log('');
  console.log('3. Short Abbreviation Word Boundaries:');
  console.log(' ✓ Polish: "pn", "cz", "n" (prevented false matches)');
  console.log('');
  console.log('4. Invalid Time Filtering:');
  console.log(' ✓ Filtered: 00:00-04:59 (unrealistic mass times)');
  console.log(' ✓ German "00 uhr" fragments filtered');
  console.log('');
  console.log('5. Deduplication:');
  console.log(' ✓ Same day+time appearing multiple times on page');
  console.log('');
  console.log('6. "Closed" Notice Filtering:');
  console.log(' ✓ Polish: "nieczynna: niedziela, poniedziałek"');
  console.log(' ✓ Multi-language: fermé, cerrado, geschlossen, chiuso');
  console.log('');
  console.log('7. Time Format Support:');
  console.log(' ✓ AM/PM: "8:30 AM", "8 PM"');
  console.log(' ✓ 24-hour: "18:00", "8:30"');
  console.log(' ✓ French/Portuguese: "18h30", "8h"');
  console.log(' ✓ German: "17 Uhr", "17:00 Uhr"');
  console.log(' ✓ Italian: "18.30"');
  console.log(' ✓ Polish: "8 00", "18 00"');
  process.exit(failCount > 0 ? 1 : 0);
}
main().catch((error) => {
  console.error(error);
  // A crashed suite must not look like a passing one: exit non-zero.
  process.exit(1);
});

152
scripts/test-scraper.ts Normal file
View File

@@ -0,0 +1,152 @@
import { GenericScraper } from '../src/scrapers/strategies/generic';
import { getScraper } from '../src/scrapers/registry';
import type { BaseScraper, ScrapeResult } from '../src/scrapers/base-scraper';
/** Flags whose following argument is their value rather than a positional argument. */
const VALUE_FLAGS: readonly string[] = ['--country', '--lang'];

/**
 * Splits the CLI arguments (without the node/script prefix) into the
 * positional URL and the optional flag values.
 *
 * The URL is the first argument that is neither a `--flag` nor the value of
 * one, so flags may come before or after it. A flag that is last, or that is
 * directly followed by another `--flag`, has no value and yields null. When a
 * flag is repeated, its first occurrence wins.
 */
function parseCliArgs(argv: readonly string[]): { url: string | null; country: string | null; lang: string | null } {
  const flagValues = new Map<string, string | null>();
  let url: string | null = null;
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (VALUE_FLAGS.includes(arg)) {
      const next = argv[i + 1];
      const hasValue = next !== undefined && !next.startsWith('--');
      if (!flagValues.has(arg)) flagValues.set(arg, hasValue ? next : null);
      if (hasValue) i++; // the value has been consumed; do not treat it as the URL
    } else if (url === null && !arg.startsWith('--')) {
      url = arg;
    }
  }
  return {
    url,
    country: flagValues.get('--country') ?? null,
    lang: flagValues.get('--lang') ?? null,
  };
}

const cliArgs = parseCliArgs(process.argv.slice(2));
// `||` on purpose: an empty-string URL also falls back to the default page.
const TEST_URL = cliArgs.url || 'https://www.saintpatrickscathedral.org/masses';
// Value of the --country flag, or null when it was not given.
const COUNTRY_CODE = cliArgs.country;
// Value of the --lang flag (e.g., --lang english), or null when it was not given.
const LANG = cliArgs.lang;
// Indexed by dayOfWeek: 0 = Sunday ... 6 = Saturday.
const DAY_NAMES = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
/**
 * Scrapes TEST_URL once and prints a report: status, any church data found,
 * the parsed schedules grouped by weekday, a plain-text preview of the page
 * and a summary. Uses the scraper registered for LANG when --lang was given,
 * otherwise the generic scraper. The browser is always closed at the end.
 */
async function main() {
  console.log('\n' + '='.repeat(70));
  console.log('NEARESTMASS SCRAPER TEST');
  console.log('='.repeat(70));
  console.log(`\nURL: ${TEST_URL}`);
  console.log(`Country: ${COUNTRY_CODE || '(auto-detect from <html lang>)'}`);
  console.log(`Scraper: ${LANG || 'generic'}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('\n' + '-'.repeat(70));
  let scraper: BaseScraper;
  if (LANG) {
    scraper = getScraper(LANG);
    console.log(`\n Using ${LANG} scraper`);
  } else {
    scraper = new GenericScraper();
  }
  try {
    console.log('\n[1/4] Initializing browser...');
    await scraper.init();
    console.log(' ✓ Browser ready');
    // Only the generic scraper exposes setCountry, hence the instanceof guard.
    if (COUNTRY_CODE && scraper instanceof GenericScraper) {
      scraper.setCountry(COUNTRY_CODE);
      console.log(` Country set to: ${COUNTRY_CODE}`);
    }
    console.log('\n[2/4] Fetching page...');
    const startTime = Date.now();
    const result: ScrapeResult = await scraper.scrape(TEST_URL);
    const elapsed = Date.now() - startTime;
    console.log(` ✓ Page loaded in ${elapsed}ms`);
    console.log('\n[3/4] Parsing results...');
    console.log(` Status: ${result.success ? '✓ SUCCESS' : '✗ FAILED'}`);
    console.log(` Schedules found: ${result.schedules.length}`);
    if (result.detectedLanguage) {
      console.log(` Detected language: ${result.detectedLanguage}`);
    }
    if (result.churchData) {
      console.log('\n Church Data:');
      if (result.churchData.phone) console.log(` Phone: ${result.churchData.phone}`);
      if (result.churchData.email) console.log(` Email: ${result.churchData.email}`);
      if (result.churchData.pastorName) console.log(` Pastor: ${result.churchData.pastorName}`);
      if (result.churchData.diocese) console.log(` Diocese: ${result.churchData.diocese}`);
    }
    if (result.error) {
      console.log(` Error: ${result.error}`);
    }
    if (result.schedules.length > 0) {
      console.log('\n' + '-'.repeat(70));
      console.log('PARSED MASS SCHEDULES');
      console.log('-'.repeat(70));
      // Bucket schedules by weekday so they print in Sunday..Saturday order.
      const byDay: Record<number, typeof result.schedules> = {};
      for (const schedule of result.schedules) {
        if (!byDay[schedule.dayOfWeek]) {
          byDay[schedule.dayOfWeek] = [];
        }
        byDay[schedule.dayOfWeek].push(schedule);
      }
      for (let day = 0; day < 7; day++) {
        const schedules = byDay[day];
        if (schedules && schedules.length > 0) {
          console.log(`\n${DAY_NAMES[day]}:`);
          for (const s of schedules) {
            // Empty parts are dropped; the language is shown only when it is not English.
            const parts = [
              ` ${s.time}`,
              s.language && s.language !== 'English' ? `(${s.language})` : '',
              s.massType ? `[${s.massType}]` : '',
              s.notes ? `- ${s.notes}` : '',
            ].filter(Boolean);
            console.log(parts.join(' '));
          }
        }
      }
    }
    if (result.rawHtml) {
      console.log('\n' + '-'.repeat(70));
      console.log('RAW TEXT PREVIEW (first 1000 chars, stripped of HTML)');
      console.log('-'.repeat(70));
      // Drop scripts, styles and tags, turn en/em dashes into hyphens and
      // collapse whitespace to get a readable plain-text version of the page.
      const strippedText = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/[\u2013\u2014]/g, '-')
        .replace(/\s+/g, ' ')
        .trim();
      console.log('\n' + strippedText.substring(0, 1000));
      // Judge truncation by the stripped text that is shown, not by the raw
      // HTML size, so a short page with heavy markup is not reported as cut off.
      if (strippedText.length > 1000) {
        console.log('\n... (truncated)');
      }
    }
    console.log('\n' + '='.repeat(70));
    console.log('SUMMARY');
    console.log('='.repeat(70));
    console.log(`URL: ${TEST_URL}`);
    console.log(`Scraper: ${LANG || 'generic'}`);
    console.log(`Country: ${COUNTRY_CODE || '(auto-detected)'}`);
    console.log(`Language: ${result.detectedLanguage || '(unknown)'}`);
    console.log(`Success: ${result.success ? 'Yes' : 'No'}`);
    console.log(`Schedules: ${result.schedules.length}`);
    console.log(`HTML Size: ${result.rawHtml ? Math.round(result.rawHtml.length / 1024) + ' KB' : 'N/A'}`);
    if (result.schedules.length > 0) {
      const days = [...new Set(result.schedules.map(s => s.dayOfWeek))];
      const languages = [...new Set(result.schedules.map(s => s.language || 'English'))];
      console.log(`Days: ${days.map(d => DAY_NAMES[d]).join(', ')}`);
      console.log(`Languages: ${languages.join(', ')}`);
    }
    console.log('='.repeat(70) + '\n');
  } catch (error) {
    console.error('\n[ERROR]', error);
  } finally {
    console.log('[4/4] Closing browser...');
    await scraper.close();
    console.log(' ✓ Done\n');
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,135 @@
import { discoverMassScheduleUrl } from '../src/scrapers/url-discovery';
// Sites probed in batch mode, i.e. when no URL is passed on the command line.
const TEST_SITES = [
'https://www.saintpatrickscathedral.org',
'https://www.holynamecathedral.org',
'https://www.olacathedral.org',
];
// Marker printed next to each confidence level in the report.
const CONFIDENCE_ICONS: Record<string, string> = {
high: '🟢',
medium: '🟡',
low: '🔴',
};
// Human-readable explanation for each discovery method in the report.
const METHOD_DESCRIPTIONS: Record<string, string> = {
pattern: 'Found via URL pattern matching',
link: 'Found via link crawling',
homepage: 'Fell back to homepage',
};
/**
 * Runs mass-schedule URL discovery for a single site and prints the discovered
 * URL, the method used, the confidence level and the elapsed time.
 *
 * @param url Site to start discovery from.
 */
async function testSingleUrl(url: string) {
  const heavyRule = '='.repeat(70);
  const lightRule = '-'.repeat(70);

  console.log('\n' + heavyRule);
  console.log('NEARESTMASS URL DISCOVERY TEST');
  console.log(heavyRule);
  console.log(`\nURL: ${url}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('\n' + lightRule);

  console.log('\n[1/2] Discovering mass schedule URL...');
  const began = Date.now();
  const discovery = await discoverMassScheduleUrl(url);
  const elapsed = Date.now() - began;
  console.log(` ✓ Discovery completed in ${elapsed}ms`);

  const methodNote = METHOD_DESCRIPTIONS[discovery.method];
  const confidenceIcon = CONFIDENCE_ICONS[discovery.confidence];
  console.log('\n[2/2] Results:');
  console.log(` Discovered URL: ${discovery.url}`);
  console.log(` Method: ${discovery.method} (${methodNote})`);
  console.log(` Confidence: ${confidenceIcon} ${discovery.confidence}`);

  console.log('\n' + heavyRule);
  console.log('SUMMARY');
  console.log(heavyRule);
  console.log(`Input: ${url}`);
  console.log(`Output: ${discovery.url}`);
  console.log(`Method: ${discovery.method}`);
  console.log(`Confidence: ${discovery.confidence}`);
  console.log(`Time: ${elapsed}ms`);
  console.log(heavyRule + '\n');
}
/**
 * Runs mass-schedule URL discovery for every entry of TEST_SITES, one at a
 * time with a two-second pause between sites, then prints confidence counts,
 * the total elapsed time and a per-site result list.
 */
async function testMultipleSites() {
  const heavyRule = '='.repeat(70);
  const lightRule = '-'.repeat(70);
  const siteCount = TEST_SITES.length;

  console.log('\n' + heavyRule);
  console.log('NEARESTMASS URL DISCOVERY TEST (BATCH)');
  console.log(heavyRule);
  console.log(`\nTesting ${siteCount} sites...`);
  console.log(`Time: ${new Date().toISOString()}`);

  type SiteOutcome = {
    site: string;
    url: string;
    method: string;
    confidence: string;
    elapsed: number;
  };
  const outcomes: SiteOutcome[] = [];

  for (const [index, site] of TEST_SITES.entries()) {
    console.log('\n' + lightRule);
    console.log(`[${index + 1}/${siteCount}] Testing: ${site}`);
    console.log(lightRule);

    const began = Date.now();
    const discovery = await discoverMassScheduleUrl(site);
    const elapsed = Date.now() - began;

    console.log(`\n Discovered URL: ${discovery.url}`);
    console.log(` Method: ${discovery.method} (${METHOD_DESCRIPTIONS[discovery.method]})`);
    console.log(` Confidence: ${CONFIDENCE_ICONS[discovery.confidence]} ${discovery.confidence}`);
    console.log(` Time: ${elapsed}ms`);

    outcomes.push({
      site,
      url: discovery.url,
      method: discovery.method,
      confidence: discovery.confidence,
      elapsed,
    });

    // Rate limiting between sites; no pause is needed after the final one.
    const isLastSite = index === siteCount - 1;
    if (!isLastSite) {
      console.log('\n Waiting 2s before next site...');
      await new Promise((wake) => setTimeout(wake, 2000));
    }
  }

  // Summary figures
  let highCount = 0;
  let mediumCount = 0;
  let lowCount = 0;
  let totalTime = 0;
  for (const outcome of outcomes) {
    if (outcome.confidence === 'high') highCount++;
    if (outcome.confidence === 'medium') mediumCount++;
    if (outcome.confidence === 'low') lowCount++;
    totalTime += outcome.elapsed;
  }

  console.log('\n' + heavyRule);
  console.log('SUMMARY');
  console.log(heavyRule);
  console.log(`\nSites tested: ${outcomes.length}`);
  console.log(`High conf: ${highCount} 🟢`);
  console.log(`Medium conf: ${mediumCount} 🟡`);
  console.log(`Low conf: ${lowCount} 🔴`);
  console.log(`Total time: ${totalTime}ms`);

  console.log('\n' + lightRule);
  console.log('RESULTS BY SITE');
  console.log(lightRule);
  outcomes.forEach((outcome) => {
    console.log(`\n${outcome.site}`);
    console.log(`${outcome.url}`);
    console.log(` ${CONFIDENCE_ICONS[outcome.confidence]} ${outcome.confidence} via ${outcome.method}`);
  });
  console.log('\n' + heavyRule + '\n');
}
/**
 * Entry point: tests the URL given as the first CLI argument, or the whole
 * TEST_SITES batch when no argument was passed.
 */
async function main() {
  const [, , testUrl] = process.argv;
  if (!testUrl) {
    await testMultipleSites();
    return;
  }
  await testSingleUrl(testUrl);
}
main().catch(console.error);

View File

@@ -0,0 +1,319 @@
#!/usr/bin/env tsx
/**
* Transfer enriched church data from Synology NAS to Neon production
*
* This script transfers ONLY churches that have been enriched or scraped
* (have websites, phone numbers, or mass schedules) to reduce data transfer.
*
* Usage:
* npx tsx scripts/transfer-enriched-to-neon.ts # Dry run
* npx tsx scripts/transfer-enriched-to-neon.ts --execute # Actually transfer
*/
import { PrismaClient } from '@prisma/client';
import { PrismaPg } from '@prisma/adapter-pg';
import { Pool } from 'pg';
import dotenv from 'dotenv';
import path from 'path';
/**
 * Counters describing one transfer run.
 * NOTE(review): the field meanings below are inferred from their names — the
 * code that updates them is outside this view; confirm against main().
 */
interface TransferStats {
// Presumably churches examined, whether or not they were written.
churchesProcessed: number;
// Presumably churches newly created in the target database.
churchesInserted: number;
// Presumably churches that already existed in the target and were updated.
churchesUpdated: number;
// Presumably schedule rows transferred, per schedule type.
massSchedules: number;
confessionSchedules: number;
adorationSchedules: number;
// Presumably the number of failures during the run.
errors: number;
}
/**
 * Parses the optional `--since <timestamp>` command-line argument.
 *
 * @param args Command-line arguments (without the node/script prefix).
 * @returns The parsed date, or `null` when `--since` was not passed.
 * @throws Error when `--since` has no value or the value is not a parseable date.
 */
function parseSinceArg(args: readonly string[]): Date | null {
  const flagIndex = args.indexOf('--since');
  if (flagIndex === -1) {
    return null;
  }

  const raw = args[flagIndex + 1];
  if (!raw || raw.startsWith('--')) {
    throw new Error('--since requires a timestamp value, e.g. --since 2026-02-01T00:00:00Z');
  }

  const parsed = new Date(raw);
  // `new Date('garbage')` does not throw; it yields an Invalid Date whose
  // toISOString() throws later, so reject it here with a readable message.
  if (Number.isNaN(parsed.getTime())) {
    throw new Error(`Invalid --since timestamp "${raw}" (expected an ISO 8601 date, e.g. 2026-02-01T00:00:00Z)`);
  }
  return parsed;
}

/**
 * Returns the `host[:port]` part of a database connection URL for log output.
 * Credentials are never included; anything that cannot be parsed as a URL is
 * reported as `'unknown'`.
 */
function describeDbHost(connectionString: string | undefined): string {
  if (!connectionString) {
    return 'unknown';
  }
  try {
    return new URL(connectionString).host || 'unknown';
  } catch {
    return 'unknown';
  }
}

/**
 * Copies enriched churches (and their mass/confession/adoration schedules)
 * from the NAS database to the Neon production database.
 *
 * CLI flags (read from `process.argv`):
 * - `--execute`      write to Neon; without it the script is a dry run that only counts.
 * - `--force-all`    transfer every enriched church, ignoring timestamps.
 * - `--since <date>` transfer only churches whose `updatedAt` is after `<date>`.
 * With neither `--force-all` nor `--since`, the cutoff is the most recent
 * `lastTransferredAt` found on the NAS (or no cutoff if none exists).
 *
 * Connection strings are read from `DATABASE_URL`: first after loading `.env`
 * (NAS), then again after loading `.env.production` with override (Neon).
 *
 * Exit status: the process exits with 1 on a fatal error; `process.exitCode`
 * is set to 1 for an invalid `--since` value or when individual churches
 * failed to transfer.
 */
async function main(): Promise<void> {
  // Parse CLI arguments
  const args = process.argv.slice(2);
  const dryRun = !args.includes('--execute');
  const forceAll = args.includes('--force-all');

  let sinceTimestamp: Date | null;
  try {
    sinceTimestamp = parseSinceArg(args);
  } catch (error) {
    // Reject a bad --since before the production countdown or any connection is opened.
    console.error(`❌ ${error instanceof Error ? error.message : String(error)}`);
    process.exitCode = 1;
    return;
  }

  console.log('════════════════════════════════════════════════════════════');
  console.log(' Transfer Enriched Data: Synology NAS → Neon Production');
  console.log('════════════════════════════════════════════════════════════\n');

  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No data will be written to Neon\n');
  } else {
    console.log('⚠️  PRODUCTION MODE - Data will be written to Neon');
    console.log('Press Ctrl+C within 5 seconds to cancel...\n');
    await new Promise(resolve => setTimeout(resolve, 5000));
  }

  if (forceAll) {
    console.log('🔄 FORCE ALL MODE - Transferring all enriched churches\n');
  } else if (sinceTimestamp) {
    console.log(`📅 INCREMENTAL MODE - Only churches modified since ${sinceTimestamp.toISOString()}\n`);
  } else {
    console.log('📅 AUTO INCREMENTAL MODE - Detecting last transfer timestamp...\n');
  }

  // Step 1: Connect to NAS database
  console.log('[1/3] Connecting to Synology NAS database...');
  dotenv.config({ path: path.resolve(process.cwd(), '.env') });
  const nasConnectionString = process.env.DATABASE_URL;
  if (!nasConnectionString) {
    // Without this guard the pool would be created with no connection string at all.
    console.error('❌ Transfer failed: DATABASE_URL is not set for the NAS database (expected in .env)');
    process.exit(1);
  }

  const nasPool = new Pool({ connectionString: nasConnectionString });
  const nasAdapter = new PrismaPg(nasPool);
  const nasPrisma = new PrismaClient({ adapter: nasAdapter });

  try {
    await nasPrisma.$connect();
    console.log(`✅ Connected to NAS: ${describeDbHost(nasConnectionString)}\n`);

    // Detect last transfer timestamp if not specified
    let transferSince: Date | null = sinceTimestamp;
    if (!forceAll && !sinceTimestamp) {
      // Auto-detect: find the most recent lastTransferredAt across all churches
      const lastTransfer = await nasPrisma.church.findFirst({
        where: { lastTransferredAt: { not: null } },
        orderBy: { lastTransferredAt: 'desc' },
        select: { lastTransferredAt: true }
      });
      if (lastTransfer?.lastTransferredAt) {
        transferSince = lastTransfer.lastTransferredAt;
        console.log(`✅ Last transfer detected: ${transferSince.toISOString()}`);
        console.log(`   Will transfer churches modified after this time\n`);
      } else {
        console.log('   No previous transfer detected - will transfer all enriched churches\n');
      }
    }

    // --force-all disables the cutoff even when --since was also given.
    const incrementalSince = forceAll ? null : transferSince;

    // Step 2: Export enriched churches from NAS
    console.log('[2/3] Exporting enriched churches from NAS...');
    console.log('Criteria: Has website OR phone OR google_place_id OR mass schedules\n');

    // "Enriched" means at least one of these is present.
    const enrichedFilter = {
      OR: [
        { website: { not: null } },
        { phone: { not: null } },
        { googlePlaceId: { not: null } },
        { massSchedules: { some: {} } },
      ]
    };
    // Add incremental filter if applicable
    const whereClause = incrementalSince
      ? { ...enrichedFilter, AND: { updatedAt: { gt: incrementalSince } } }
      : enrichedFilter;
    if (incrementalSince) {
      console.log(`🔄 Incremental filter: updatedAt > ${incrementalSince.toISOString()}\n`);
    }

    const BATCH_SIZE = 200;
    const totalCount = await nasPrisma.church.count({ where: whereClause });
    console.log(`Found ${totalCount} enriched churches (will process in batches of ${BATCH_SIZE})\n`);

    if (totalCount === 0) {
      console.log('⚠️  No enriched churches to transfer');
      await nasPrisma.$disconnect();
      return;
    }

    // Step 3: Import to Neon
    console.log('[3/3] Importing to Neon production database...');

    // Load Neon credentials; `override` replaces the NAS DATABASE_URL loaded above.
    dotenv.config({ path: path.resolve(process.cwd(), '.env.production'), override: true });
    const neonConnectionString = process.env.DATABASE_URL;
    if (!neonConnectionString || neonConnectionString === nasConnectionString) {
      // If .env.production is missing or lacks DATABASE_URL the value is still the
      // NAS URL, and the "import" would delete and rewrite schedules in the source DB.
      throw new Error(
        '.env.production did not provide a DATABASE_URL different from the NAS one; ' +
        'refusing to use the NAS database as the transfer target'
      );
    }

    const neonPool = new Pool({ connectionString: neonConnectionString });
    const neonAdapter = new PrismaPg(neonPool);
    const neonPrisma = new PrismaClient({ adapter: neonAdapter });

    try {
      await neonPrisma.$connect();
      console.log(`✅ Connected to Neon: ${describeDbHost(neonConnectionString)}\n`);

      const stats: TransferStats = {
        churchesProcessed: 0,
        churchesInserted: 0,
        churchesUpdated: 0,
        massSchedules: 0,
        confessionSchedules: 0,
        adorationSchedules: 0,
        errors: 0,
      };

      for (let skip = 0; skip < totalCount; skip += BATCH_SIZE) {
        const churches = await nasPrisma.church.findMany({
          where: whereClause,
          include: {
            massSchedules: true,
            confessionSchedules: true,
            adorationSchedules: true,
          },
          skip,
          take: BATCH_SIZE,
          // Stable ordering so skip/take pages do not overlap.
          orderBy: { id: 'asc' },
        });

        console.log(`\nBatch ${Math.floor(skip / BATCH_SIZE) + 1}: processing ${churches.length} churches (${skip + 1}-${skip + churches.length} of ${totalCount})`);

        for (const church of churches) {
          // A failure on one church is counted and logged; the run continues.
          try {
            stats.churchesProcessed++;

            const massSchedules = church.massSchedules || [];
            const confessionSchedules = church.confessionSchedules || [];
            const adorationSchedules = church.adorationSchedules || [];

            // Strip relations and NAS-local bookkeeping (id, timestamps,
            // lastTransferredAt) so only plain church columns are copied.
            const {
              massSchedules: _mass,
              confessionSchedules: _confession,
              adorationSchedules: _adoration,
              id: _nasId,
              createdAt: _nasCreatedAt,
              updatedAt: _nasUpdatedAt,
              lastTransferredAt: _nasLastTransferredAt,
              ...churchData
            } = church;

            if (!dryRun) {
              // Check if church exists in Neon.
              // NOTE(review): matching is by exact latitude/longitude equality;
              // confirm coordinates are always present and identical in both databases.
              const existing = await neonPrisma.church.findFirst({
                where: {
                  latitude: church.latitude,
                  longitude: church.longitude,
                }
              });

              let resultId: string;
              if (existing) {
                // Update existing church: NAS values win when present, otherwise
                // the Neon value is kept. The name is always overwritten.
                await neonPrisma.church.update({
                  where: { id: existing.id },
                  data: {
                    website: churchData.website || existing.website,
                    phone: churchData.phone || existing.phone,
                    googlePlaceId: churchData.googlePlaceId || existing.googlePlaceId,
                    name: churchData.name,
                    address: churchData.address || existing.address,
                    city: churchData.city || existing.city,
                    state: churchData.state || existing.state,
                    zip: churchData.zip || existing.zip,
                    massScheduleUrl: churchData.massScheduleUrl || existing.massScheduleUrl,
                    lastTransferredAt: new Date(), // Mark as transferred
                  }
                });
                resultId = existing.id;
                stats.churchesUpdated++;

                // Delete old schedules; they are replaced by the NAS set below.
                // NOTE(review): delete + re-insert is not wrapped in a transaction,
                // so a failure part-way leaves this church with partial schedules.
                await neonPrisma.massSchedule.deleteMany({ where: { churchId: existing.id } });
                await neonPrisma.confessionSchedule.deleteMany({ where: { churchId: existing.id } });
                await neonPrisma.adorationSchedule.deleteMany({ where: { churchId: existing.id } });
              } else {
                // Create new church
                const newChurch = await neonPrisma.church.create({
                  data: {
                    ...churchData,
                    lastTransferredAt: new Date(), // Mark as transferred
                  }
                });
                resultId = newChurch.id;
                stats.churchesInserted++;
              }

              // Insert schedules, dropping NAS ids/timestamps and re-pointing
              // each row at the Neon church id.
              for (const schedule of massSchedules) {
                const { id: _id, createdAt: _createdAt, updatedAt: _updatedAt, ...scheduleData } = schedule;
                await neonPrisma.massSchedule.create({
                  data: { ...scheduleData, churchId: resultId }
                });
                stats.massSchedules++;
              }
              for (const schedule of confessionSchedules) {
                const { id: _id, createdAt: _createdAt, updatedAt: _updatedAt, ...scheduleData } = schedule;
                await neonPrisma.confessionSchedule.create({
                  data: { ...scheduleData, churchId: resultId }
                });
                stats.confessionSchedules++;
              }
              for (const schedule of adorationSchedules) {
                const { id: _id, createdAt: _createdAt, updatedAt: _updatedAt, ...scheduleData } = schedule;
                await neonPrisma.adorationSchedule.create({
                  data: { ...scheduleData, churchId: resultId }
                });
                stats.adorationSchedules++;
              }

              // Update NAS record with transfer timestamp (after successful transfer to Neon)
              await nasPrisma.church.update({
                where: { id: church.id },
                data: { lastTransferredAt: new Date() }
              });
            } else {
              // Dry run - just count
              stats.massSchedules += massSchedules.length;
              stats.confessionSchedules += confessionSchedules.length;
              stats.adorationSchedules += adorationSchedules.length;
            }

            if (stats.churchesProcessed % 100 === 0) {
              console.log(`Progress: ${stats.churchesProcessed}/${totalCount} churches...`);
            }
          } catch (error) {
            stats.errors++;
            console.error(`Error transferring ${church.name}:`, error instanceof Error ? error.message : error);
          }
        }
      } // end batch loop

      console.log('\n════════════════════════════════════════════════════════════');
      console.log('Transfer Summary');
      console.log('════════════════════════════════════════════════════════════');
      if (incrementalSince) {
        console.log(`Transfer mode: Incremental (since ${incrementalSince.toISOString()})`);
      } else {
        console.log(`Transfer mode: Full (all enriched churches)`);
      }
      console.log(`Churches processed: ${stats.churchesProcessed}`);
      console.log(`Churches inserted: ${stats.churchesInserted}`);
      console.log(`Churches updated: ${stats.churchesUpdated}`);
      console.log(`Mass schedules: ${stats.massSchedules}`);
      console.log(`Confession schedules: ${stats.confessionSchedules}`);
      console.log(`Adoration schedules: ${stats.adorationSchedules}`);
      console.log(`Errors: ${stats.errors}`);
      console.log('════════════════════════════════════════════════════════════\n');

      await neonPrisma.$disconnect();
      await nasPrisma.$disconnect();

      if (dryRun) {
        console.log('💡 This was a DRY RUN. To actually transfer to Neon, run:');
        console.log('   Incremental sync (default):');
        console.log('   npx tsx scripts/transfer-enriched-to-neon.ts --execute\n');
        console.log('   Transfer all enriched churches:');
        console.log('   npx tsx scripts/transfer-enriched-to-neon.ts --execute --force-all\n');
        console.log('   Transfer since specific date:');
        console.log('   npx tsx scripts/transfer-enriched-to-neon.ts --execute --since 2026-02-01T00:00:00Z\n');
      } else if (stats.errors > 0) {
        // Failed churches never received lastTransferredAt, while the next
        // auto-incremental run uses the newest lastTransferredAt as its cutoff,
        // so do not report success and make the failure visible to callers.
        console.error(`⚠️  Transfer finished with ${stats.errors} error(s); the failed churches were NOT transferred.`);
        console.error('   They may be skipped by the next auto-incremental run - re-run with --since <timestamp> or --force-all.\n');
        process.exitCode = 1;
      } else {
        console.log('🎉 Data successfully transferred to Neon production!\n');
      }
    } catch (error) {
      console.error('❌ Neon import failed:', error);
      await neonPrisma.$disconnect();
      throw error;
    }
  } catch (error) {
    console.error('❌ Transfer failed:', error);
    await nasPrisma.$disconnect();
    process.exit(1);
  }
}
// Run the script. A rejection that escapes main() is printed and turned into a
// non-zero exit status; logging alone would leave the process exiting with 0.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});