chore: sync with Gitea master and restore local-only files

Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Albert
2026-04-12 19:11:22 -04:00
parent 76cca3ba75
commit 2c51513851
133 changed files with 30381 additions and 0 deletions

View File

@@ -0,0 +1,165 @@
import { config } from 'dotenv';
import { PrismaClient } from '@prisma/client';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
// Environment files are read in this order: .env.local first, then .env
for (const envFile of ['.env.local', '.env']) {
  config({ path: envFile });
}

const connectionString = process.env.DATABASE_URL;
if (connectionString === undefined || connectionString === '') {
  throw new Error('DATABASE_URL environment variable is not set');
}

// Prisma talks to Postgres through a node-postgres pool wrapped in the pg adapter.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/** Website-coverage figures aggregated over the churches of one country. */
interface CountryStats {
// Country value of the church rows, or 'Unknown' when a row has none
country: string;
// Number of churches counted for this country
totalChurches: number;
// Churches that have the hasWebsite flag set or a website value
withWebsite: number;
// Churches with neither the flag nor a website value
withoutWebsite: number;
// withWebsite as a percentage of totalChurches (0-100)
websitePercent: number;
// Churches still to be enriched; incremented together with withoutWebsite
needEnrichment: number;
// Ranking score; higher means the country should be enriched earlier
priority: number;
}
/**
 * Ranks countries by how much website enrichment their OSM churches still
 * need and prints four reports: the full ranking, a top-10 breakdown, a
 * timeline estimate and a ready-to-paste `COUNTRY_PRIORITY` array.
 *
 * Side effects: reads churches through `prisma`, writes to stdout/stderr and
 * always disconnects `prisma` and ends `pool` before returning. On failure the
 * error is logged and the process exit code is set to 1.
 */
async function analyzeEnrichmentPriority() {
  // Enrichment throughput assumed by the timeline estimate (churches per day).
  const CHURCHES_PER_DAY = 390;
  const rule = '═══════════════════════════════════════════════════════════════════════════';

  try {
    console.log('Analyzing enrichment priority by country...\n');

    // Fetch every OSM church; the grouping is done in memory below
    const churches = await prisma.church.findMany({
      where: { source: 'osm' },
      select: { country: true, hasWebsite: true, website: true },
    });

    // Group by country and count churches with / without a website
    const byCountry = new Map<string, CountryStats>();
    for (const church of churches) {
      const country = church.country || 'Unknown';
      let entry = byCountry.get(country);
      if (!entry) {
        entry = {
          country,
          totalChurches: 0,
          withWebsite: 0,
          withoutWebsite: 0,
          websitePercent: 0,
          needEnrichment: 0,
          priority: 0,
        };
        byCountry.set(country, entry);
      }
      entry.totalChurches++;
      if (church.hasWebsite || church.website) {
        entry.withWebsite++;
      } else {
        entry.withoutWebsite++;
        entry.needEnrichment++;
      }
    }

    // Calculate percentages and the priority score.
    // Every entry has at least one church, so the division is safe.
    const stats = Array.from(byCountry.values());
    for (const stat of stats) {
      stat.websitePercent = (stat.withWebsite / stat.totalChurches) * 100;
      // The score is the pending volume (in thousands), of which 80% counts
      // unconditionally and 20% is scaled by the share of churches that lack
      // a website. Large countries with low coverage therefore rank first.
      const needWeight = stat.needEnrichment / 1000;
      const coverageGap = 100 - stat.websitePercent;
      stat.priority = needWeight * 0.8 + (coverageGap / 100) * needWeight * 0.2;
    }

    // Sort by priority (highest first)
    stats.sort((a, b) => b.priority - a.priority);

    // Full ranking table
    console.log(rule);
    console.log('ENRICHMENT PRIORITY RANKING');
    console.log(rule);
    console.log('');
    // The printed formula now mirrors the computation above.
    console.log('Priority formula: (churches_needing_enrichment / 1000) * (0.8 + 0.2 * coverage_gap_fraction)');
    console.log('This favors countries with many churches and low website coverage.');
    console.log('');
    console.log('Rank | Country | Total | Need Enrichment | Coverage | Priority Score');
    console.log('─────┼─────────┼───────┼────────────────┼──────────┼────────────────');
    stats.forEach((stat, index) => {
      const rank = String(index + 1).padStart(4);
      const country = stat.country.padEnd(7);
      const total = String(stat.totalChurches).padStart(5);
      const need = String(stat.needEnrichment).padStart(15);
      const coverage = `${stat.websitePercent.toFixed(1)}%`.padStart(8);
      const priority = stat.priority.toFixed(2).padStart(14);
      console.log(`${rank} | ${country} | ${total} | ${need} | ${coverage} | ${priority}`);
    });
    console.log('');
    console.log(rule);
    console.log('');

    // Top 10 with details
    console.log('TOP 10 COUNTRIES TO PRIORITIZE:');
    console.log('');
    stats.slice(0, 10).forEach((stat, index) => {
      console.log(`${index + 1}. ${stat.country}`);
      console.log(` Total churches: ${stat.totalChurches.toLocaleString()}`);
      console.log(` Need enrichment: ${stat.needEnrichment.toLocaleString()} (${(100 - stat.websitePercent).toFixed(1)}% missing)`);
      console.log(` Current coverage: ${stat.websitePercent.toFixed(1)}%`);
      console.log(` Priority score: ${stat.priority.toFixed(2)}`);
      console.log('');
    });

    // Enrichment timeline
    const totalNeedEnrichment = stats.reduce((sum, s) => sum + s.needEnrichment, 0);
    const daysAtFullSpeed = Math.ceil(totalNeedEnrichment / CHURCHES_PER_DAY);
    const monthsAtFullSpeed = (daysAtFullSpeed / 30).toFixed(1);
    console.log(rule);
    console.log('ENRICHMENT TIMELINE');
    console.log(rule);
    console.log(`Total churches needing enrichment: ${totalNeedEnrichment.toLocaleString()}`);
    console.log(`At ${CHURCHES_PER_DAY} churches/day (free tier): ${daysAtFullSpeed} days (~${monthsAtFullSpeed} months)`);
    console.log('');

    // Country priority order, printed as source code for the enrichment script
    console.log(rule);
    console.log('COUNTRY PRIORITY ORDER (for enrichment script)');
    console.log(rule);
    console.log('');
    console.log('const COUNTRY_PRIORITY = [');
    // Filter once instead of once per printed row
    const pending = stats.filter((s) => s.needEnrichment > 0);
    pending.forEach((stat, index) => {
      const comma = index < pending.length - 1 ? ',' : '';
      console.log(` '${stat.country}'${comma} // ${stat.needEnrichment.toLocaleString()} churches`);
    });
    console.log('];');
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // process.exit() here would terminate before the finally block runs,
    // so only record the failure and let the cleanup finish.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
analyzeEnrichmentPriority();

View File

@@ -0,0 +1,66 @@
#!/usr/bin/env tsx
/**
* Check the 2 potentially real bugs
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Re-scrapes two parish websites (one Spanish, one Polish) and prints the
 * scrape outcome plus up to five sample schedules for each. When the Polish
 * page yields no schedules, prints a 300-character snippet of the page text
 * starting at the first Polish schedule keyword that occurs in it.
 *
 * The scraper is closed even when a scrape throws.
 */
async function checkRealBugs() {
  const DAY_NAMES = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];
  // Polish schedule keywords, in order of preference
  const POLISH_KEYWORDS = ['msze', 'msza', 'nabożeńst'];

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    console.log('=== 1. Iglesia de San Fernando (trying Spanish page) ===\n');
    scraper.setCountry('ES');
    const spanishUrl = 'https://www.parroquiasanfernandomaspalomas.net/'; // URL with the /de/ part removed
    const result1 = await scraper.scrape(spanishUrl);

    // Prints at most five schedules as "<day> <time> - <language> <massType>"
    const printSamples = (schedules: typeof result1.schedules) => {
      console.log('Sample schedules:');
      schedules.slice(0, 5).forEach((s) => {
        console.log(` ${DAY_NAMES[s.dayOfWeek]} ${s.time} - ${s.language} ${s.massType}`);
      });
    };

    console.log(`URL: ${spanishUrl}`);
    console.log(`Success: ${result1.success}`);
    console.log(`Schedules: ${result1.schedules.length}`);
    console.log(`Error: ${result1.error || 'none'}\n`);
    if (result1.schedules.length > 0) {
      printSamples(result1.schedules);
    }

    console.log('\n=== 2. Kościół (Poland) ===\n');
    scraper.setCountry('PL');
    const result2 = await scraper.scrape('http://parafialubojna.pl');
    console.log(`Success: ${result2.success}`);
    console.log(`Schedules: ${result2.schedules.length}`);
    console.log(`Error: ${result2.error || 'none'}\n`);
    if (result2.schedules.length > 0) {
      printSamples(result2.schedules);
    } else if (result2.rawHtml) {
      // Reduce the HTML to lower-case plain text: drop scripts, styles and
      // tags, then collapse whitespace.
      const text = result2.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();
      // indexOf returns -1 (truthy) on a miss, so chaining the lookups with
      // `||` never reached the later keywords. Pick the first real hit.
      const scheduleIndex =
        POLISH_KEYWORDS.map((keyword) => text.indexOf(keyword)).find((index) => index !== -1) ?? -1;
      if (scheduleIndex !== -1) {
        const snippet = text.substring(scheduleIndex, scheduleIndex + 300);
        console.log('Found schedule section:');
        console.log(snippet);
      }
    }
  } finally {
    await scraper.close();
  }
}
checkRealBugs().catch(console.error);

View File

@@ -0,0 +1,79 @@
import { Pool } from 'pg';
import * as dotenv from 'dotenv';
import * as path from 'path';
// .env.local is loaded before .env so that its values take precedence
for (const envFile of ['.env.local', '.env']) {
  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
}

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
/**
 * Prints three reports about Google enrichment of the `churches` table:
 * the 20 countries with the most rows lacking a `google_place_id`, overall
 * enriched/pending totals with percentages, and the number of enriched rows
 * updated per day over the last 7 days.
 *
 * Side effects: queries through `pool`, writes to stdout/stderr and always
 * ends `pool`. On failure the error is logged and the exit code is set to 1.
 */
async function checkEnrichmentDetail() {
  // Two-decimal percentage; an empty table yields 0.00 instead of NaN.
  const percent = (part: number, whole: number): string =>
    (whole === 0 ? 0 : (part / whole) * 100).toFixed(2);

  try {
    console.log('Connecting to database...\n');

    // Churches awaiting enrichment, per country
    const pendingResult = await pool.query(`
SELECT
country,
COUNT(*) as pending_count
FROM churches
WHERE google_place_id IS NULL
GROUP BY country
ORDER BY pending_count DESC
LIMIT 20;
`);
    console.log('=== Churches Awaiting Enrichment (Top 20 Countries) ===');
    let totalPending = 0;
    for (const row of pendingResult.rows) {
      console.log(`${row.country}: ${row.pending_count} churches`);
      // The value is parsed from text here, so use an explicit radix
      totalPending += Number.parseInt(row.pending_count, 10);
    }
    console.log(`\nTotal pending shown: ${totalPending}`);

    // Overall totals
    const statsResult = await pool.query(`
SELECT
COUNT(*) as total_churches,
COUNT(CASE WHEN google_place_id IS NOT NULL THEN 1 END) as enriched,
COUNT(CASE WHEN google_place_id IS NULL THEN 1 END) as pending
FROM churches;
`);
    const overall = statsResult.rows[0];
    // Convert explicitly instead of relying on implicit coercion in the division
    const totalChurches = Number(overall.total_churches);
    console.log('\n=== Overall Stats ===');
    console.log(`Total churches: ${overall.total_churches}`);
    console.log(`Enriched: ${overall.enriched} (${percent(Number(overall.enriched), totalChurches)}%)`);
    console.log(`Pending: ${overall.pending} (${percent(Number(overall.pending), totalChurches)}%)`);

    // Daily enrichment activity
    const rateResult = await pool.query(`
SELECT
DATE(updated_at) as date,
COUNT(*) as enriched_count
FROM churches
WHERE google_place_id IS NOT NULL
AND updated_at > NOW() - INTERVAL '7 days'
GROUP BY DATE(updated_at)
ORDER BY date DESC;
`);
    console.log('\n=== Enrichment Activity (Last 7 Days) ===');
    if (rateResult.rows.length === 0) {
      console.log('No enrichment activity in the last 7 days');
    } else {
      for (const row of rateResult.rows) {
        console.log(`${row.date}: ${row.enriched_count} churches`);
      }
    }
  } catch (error) {
    console.error('Error checking enrichment detail:', error);
    // Report the failure to the shell without skipping the cleanup below
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
checkEnrichmentDetail();

View File

@@ -0,0 +1,146 @@
import { config } from 'dotenv';
import { PrismaClient } from '@prisma/client';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
// Environment files are read in this order: .env.local first, then .env
for (const envFile of ['.env.local', '.env']) {
  config({ path: envFile });
}

const connectionString = process.env.DATABASE_URL;
if (connectionString === undefined || connectionString === '') {
  throw new Error('DATABASE_URL environment variable is not set');
}

// Prisma talks to Postgres through a node-postgres pool wrapped in the pg adapter.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Prints the enrichment status of OSM churches: overall totals, a breakdown
 * for ten hard-coded priority countries, and a completion estimate at a fixed
 * daily throughput.
 *
 * "With website" means `hasWebsite` is set or `website` is non-null, which is
 * the exact complement of the "need enrichment" filter
 * (`hasWebsite: false, website: null`), so the two figures add up to the total.
 *
 * Side effects: queries through `prisma`, writes to stdout/stderr and always
 * disconnects `prisma` and ends `pool`. On failure the exit code is set to 1.
 */
async function checkEnrichmentStatus() {
  // Enrichment throughput assumed by the timeline estimate (churches per day).
  const CHURCHES_PER_DAY = 390;
  const rule = '═══════════════════════════════════════════════════════════════';
  // Percentage of `whole`; a zero total yields 0 instead of NaN.
  const percent = (part: number, whole: number): number =>
    whole === 0 ? 0 : (part / whole) * 100;
  // The per-country query previously OR-ed in `googlePlaceId`, which counted
  // churches that merely have a Google Place ID as having a website.
  const withWebsiteFilter = {
    OR: [{ hasWebsite: true }, { website: { not: null } }],
  };

  try {
    console.log('Checking enrichment status...\n');

    // Overall stats
    const totalOSM = await prisma.church.count({
      where: { source: 'osm' },
    });
    const enriched = await prisma.church.count({
      where: { source: 'osm', googlePlaceId: { not: null } },
    });
    const withWebsite = await prisma.church.count({
      where: { source: 'osm', ...withWebsiteFilter },
    });
    const needEnrichment = await prisma.church.count({
      where: { source: 'osm', hasWebsite: false, website: null },
    });

    // Recently enriched (last 24 hours)
    const yesterday = new Date();
    yesterday.setDate(yesterday.getDate() - 1);
    const recentlyEnriched = await prisma.church.count({
      where: {
        source: 'osm',
        googlePlaceId: { not: null },
        updatedAt: { gte: yesterday },
      },
    });

    const PRIORITY_COUNTRIES = ['FR', 'DE', 'ES', 'PL', 'BR', 'PT', 'PH', 'CZ', 'MX', 'HU'];

    console.log(rule);
    console.log('OVERALL ENRICHMENT STATUS');
    console.log(rule);
    console.log(`Total OSM churches: ${totalOSM.toLocaleString()}`);
    console.log(`Churches with Google Place ID: ${enriched.toLocaleString()} (${percent(enriched, totalOSM).toFixed(2)}%)`);
    console.log(`Churches with websites: ${withWebsite.toLocaleString()} (${percent(withWebsite, totalOSM).toFixed(2)}%)`);
    console.log(`Need enrichment: ${needEnrichment.toLocaleString()} (${percent(needEnrichment, totalOSM).toFixed(2)}%)`);
    console.log('');
    console.log(`Recently enriched (24h): ${recentlyEnriched.toLocaleString()}`);
    console.log('');

    // Priority countries breakdown
    console.log(rule);
    console.log('TOP 10 PRIORITY COUNTRIES STATUS');
    console.log(rule);
    console.log('');
    for (const country of PRIORITY_COUNTRIES) {
      // The four counts are independent, so run them concurrently
      const [total, countryEnriched, countryWithWebsite, countryNeedEnrichment] = await Promise.all([
        prisma.church.count({ where: { source: 'osm', country } }),
        prisma.church.count({ where: { source: 'osm', country, googlePlaceId: { not: null } } }),
        prisma.church.count({ where: { source: 'osm', country, ...withWebsiteFilter } }),
        prisma.church.count({ where: { source: 'osm', country, hasWebsite: false, website: null } }),
      ]);
      const websitePercent = percent(countryWithWebsite, total);
      const enrichedPercent = percent(countryEnriched, total);
      console.log(`${country.padEnd(4)} | Total: ${String(total).padStart(6)} | Enriched: ${String(countryEnriched).padStart(5)} (${enrichedPercent.toFixed(1)}%) | With Website: ${String(countryWithWebsite).padStart(5)} (${websitePercent.toFixed(1)}%) | Need: ${String(countryNeedEnrichment).padStart(6)}`);
    }
    console.log('');

    // Estimate timeline
    const daysRemaining = Math.ceil(needEnrichment / CHURCHES_PER_DAY);
    const monthsRemaining = (daysRemaining / 30).toFixed(1);
    console.log(rule);
    console.log('TIMELINE ESTIMATE');
    console.log(rule);
    console.log(`At ${CHURCHES_PER_DAY} churches/day:`);
    console.log(` Days remaining: ${daysRemaining} days`);
    console.log(` Months remaining: ~${monthsRemaining} months`);
    console.log(` Estimated completion: ${new Date(Date.now() + daysRemaining * 24 * 60 * 60 * 1000).toLocaleDateString()}`);
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // process.exit() here would terminate before the finally block runs
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
checkEnrichmentStatus();

View File

@@ -0,0 +1,78 @@
import { Pool } from 'pg';
import * as dotenv from 'dotenv';
import * as path from 'path';
// .env.local is loaded before .env so that its values take precedence
for (const envFile of ['.env.local', '.env']) {
  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
}

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
/**
 * Prints a summary of Google enrichment in the `churches` table: totals and
 * the latest `updated_at` among enriched rows, the ten countries with the
 * most enriched rows updated in the last 24 hours, and up to 20 of the most
 * recently updated enriched churches.
 *
 * Side effects: queries through `pool`, writes to stdout/stderr and always
 * ends `pool`. On failure the error is logged and the exit code is set to 1,
 * so callers can detect a failed check.
 */
async function checkEnrichment() {
  try {
    console.log('Connecting to database...');

    // Totals over all enriched churches
    const totalResult = await pool.query(`
SELECT
COUNT(*) as total_enriched,
COUNT(CASE WHEN updated_at > NOW() - INTERVAL '24 hours' THEN 1 END) as enriched_last_24h,
MAX(updated_at) as last_enrichment
FROM churches
WHERE google_place_id IS NOT NULL;
`);
    const summary = totalResult.rows[0];
    console.log('\n=== Google Enrichment Summary ===');
    console.log(`Total churches with Google Place ID: ${summary.total_enriched}`);
    console.log(`Enriched in last 24 hours: ${summary.enriched_last_24h}`);
    console.log(`Last enrichment: ${summary.last_enrichment}`);

    // Per-country counts, ordered by activity in the last 24 hours
    const countryResult = await pool.query(`
SELECT
country,
COUNT(*) as enriched_count,
COUNT(CASE WHEN updated_at > NOW() - INTERVAL '24 hours' THEN 1 END) as enriched_last_24h
FROM churches
WHERE google_place_id IS NOT NULL
GROUP BY country
ORDER BY enriched_last_24h DESC
LIMIT 10;
`);
    console.log('\n=== Top Countries Enriched (Last 24h) ===');
    for (const row of countryResult.rows) {
      console.log(`${row.country}: ${row.enriched_last_24h} new / ${row.enriched_count} total`);
    }

    // Sample of the most recent enrichments
    const recentResult = await pool.query(`
SELECT
name,
city,
country,
google_place_id,
updated_at
FROM churches
WHERE google_place_id IS NOT NULL
AND updated_at > NOW() - INTERVAL '24 hours'
ORDER BY updated_at DESC
LIMIT 20;
`);
    console.log('\n=== Recent Enrichments (Last 24h, sample) ===');
    for (const row of recentResult.rows) {
      const timestamp = row.updated_at ? new Date(row.updated_at).toISOString() : 'unknown';
      console.log(`${row.name}, ${row.city}, ${row.country} - ${timestamp}`);
    }
  } catch (error) {
    console.error('Error checking enrichment:', error);
    // Report the failure to the shell without skipping the cleanup below
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
checkEnrichment();

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env tsx
/**
* Check the full section text for German church to understand office hours pattern
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrapes https://www.alterpeter.de/ and prints the page text that follows
 * the first occurrence of "montag" (200 characters) and of "sonntag"
 * (300 characters), so the surrounding wording can be inspected.
 *
 * The scraper is closed even when the scrape throws.
 */
async function checkGerman() {
  // Weekday keywords to look up and how much following text to show for each
  const sections = [
    { keyword: 'montag', label: 'Monday (Montag)', length: 200 },
    { keyword: 'sonntag', label: 'Sunday (Sonntag)', length: 300 },
  ];

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape('https://www.alterpeter.de/');
    if (result.rawHtml) {
      // Reduce the HTML to lower-case plain text: drop scripts, styles and
      // tags, then collapse whitespace.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      for (const { keyword, label, length } of sections) {
        const start = text.indexOf(keyword);
        if (start === -1) {
          continue;
        }
        console.log(`=== ${label} context ===`);
        console.log(text.substring(start, start + length));
        console.log('');
      }
    }
  } finally {
    // Previously skipped when scrape() threw, leaving the scraper open
    await scraper.close();
  }
}
checkGerman().catch(console.error);

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env tsx
import { config } from 'dotenv';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
// Load environment variables from .env.local, then from .env
for (const envFile of ['.env.local', '.env']) {
  config({ path: envFile });
}
/**
 * Connects to the database named by DATABASE_URL and prints how many Polish
 * (country 'PL') churches exist, how many of them have at least one mass
 * schedule, and a sample of three churches with their schedule counts.
 *
 * The Prisma client and the pool are released even when a query fails.
 */
async function main() {
  const connectionString = process.env.DATABASE_URL || '';
  // Mask the password part of the URL before logging it
  console.log('DATABASE_URL:', connectionString.replace(/:[^:@]+@/, ':****@'));

  const pool = new Pool({ connectionString });
  const adapter = new PrismaPg(pool);
  const prisma = new PrismaClient({ adapter });
  try {
    console.log('PrismaClient created:', !!prisma);
    // NOTE(review): confirm the model accessor name (`churches`) matches the
    // generated Prisma client; this log line exists to check exactly that.
    console.log('prisma.churches:', !!prisma.churches);
    await prisma.$connect();

    const count = await prisma.churches.count({ where: { country: 'PL' } });
    console.log(`Poland churches in Neon: ${count}`);

    const withSchedules = await prisma.churches.count({
      where: {
        country: 'PL',
        massSchedules: { some: {} },
      },
    });
    console.log(`With mass schedules: ${withSchedules}`);

    // Sample a few churches
    const sample = await prisma.churches.findMany({
      where: { country: 'PL' },
      include: { massSchedules: true },
      take: 3,
    });
    console.log('\nSample churches:');
    for (const church of sample) {
      console.log(` - ${church.name} (${church.city}): ${church.massSchedules.length} schedules`);
    }
  } finally {
    // Previously skipped when any query threw, leaving connections open
    await prisma.$disconnect();
    await pool.end();
  }
}
main().catch((error) => {
  console.error(error);
  // Make the failure visible to the calling shell
  process.exitCode = 1;
});

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env tsx
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrapes http://parafialubojna.pl and lists every occurrence of the word
 * "niedziela" in the page text, each with 30 characters of leading and
 * 70 characters of trailing context (measured from the match start).
 *
 * The scraper is closed even when the scrape throws.
 */
async function check() {
  const KEYWORD = 'niedziela';

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('PL');
    const result = await scraper.scrape('http://parafialubojna.pl');
    if (result.rawHtml) {
      // Reduce the HTML to lower-case plain text: drop scripts, styles and
      // tags, then collapse whitespace.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      const niedzielaMatches: { position: number; context: string }[] = [];
      // Resume one character after each hit so every occurrence is found
      for (let idx = text.indexOf(KEYWORD); idx !== -1; idx = text.indexOf(KEYWORD, idx + 1)) {
        niedzielaMatches.push({
          position: idx,
          context: text.substring(Math.max(0, idx - 30), idx + 70),
        });
      }

      console.log(`niedziela occurrences: ${niedzielaMatches.length}\n`);
      niedzielaMatches.forEach((m, i) => {
        console.log(`Occurrence ${i + 1} at position ${m.position}:`);
        console.log(` "${m.context}"`);
        console.log('');
      });
    }
  } finally {
    // Previously skipped when scrape() threw, leaving the scraper open
    await scraper.close();
  }
}
// Log failures instead of leaving the rejection unhandled
check().catch(console.error);

View File

@@ -0,0 +1,34 @@
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
/**
 * Prints OSM import statistics from the `churches` table: the total row
 * count, the 40 countries with the most rows, the row count of a fixed list
 * of countries that were under-imported (0 when a country has no rows), and
 * the number of distinct countries.
 *
 * The pool is ended even when a query fails.
 */
async function main() {
  try {
    const totalRes = await pool.query(`SELECT COUNT(*) as total FROM churches WHERE source = 'osm'`);
    console.log('Total OSM churches:', totalRes.rows[0].total);

    const countryRes = await pool.query(`SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country IS NOT NULL GROUP BY country ORDER BY count DESC LIMIT 40`);
    console.log('\nTop 40 countries by OSM church count:');
    for (const row of countryRes.rows) {
      console.log(` ${row.country}: ${row.count}`);
    }

    // Check key countries that were under-imported
    const keyCountries = ['AT','HR','UA','RO','LV','BY','RS','BA','MK','AL','EE','GE','AM','RU','IN','JP','CA','US','MX','AR','CO','ID','CN'];
    const keyRes = await pool.query(`SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country = ANY($1) GROUP BY country ORDER BY count DESC`, [keyCountries]);
    console.log('\nKey countries to check (were under-imported):');
    // Countries without any row are absent from the result, hence the lookup
    const found = new Map<string, string>();
    for (const row of keyRes.rows) {
      found.set(row.country, row.count);
    }
    for (const c of keyCountries) {
      console.log(` ${c}: ${found.get(c) ?? 0}`);
    }

    // Total countries
    const countriesRes = await pool.query(`SELECT COUNT(DISTINCT country) as total FROM churches WHERE source = 'osm'`);
    console.log(`\nTotal countries with OSM data: ${countriesRes.rows[0].total}`);
  } finally {
    // Previously skipped when a query threw, leaving the pool open
    await pool.end();
  }
}
main().catch((error) => {
  console.error(error);
  // Make the failure visible to the calling shell
  process.exitCode = 1;
});

View File

@@ -0,0 +1,88 @@
#!/usr/bin/env tsx
/**
* Check production database (Neon) for data
* Run with: npx tsx scripts/check-production-db.ts
*/
import { Pool } from 'pg';
import { config } from 'dotenv';
// Load environment variables (.env.local overrides .env)
for (const envFile of ['.env.local', '.env']) {
  config({ path: envFile });
}

const connectionString = process.env.DATABASE_URL;
if (connectionString === undefined || connectionString === '') {
  console.error('❌ DATABASE_URL not found in environment');
  process.exit(1);
}

console.log('🔍 Checking production database...');
// Anything that is not a neon.tech URL is reported as localhost
const targetLabel = connectionString.includes('neon.tech') ? 'Neon (Production)' : 'localhost';
console.log('📍 Connection:', targetLabel);

const pool = new Pool({ connectionString });
/**
 * Runs a series of sanity checks against the database behind `pool`:
 * connectivity, the list of public tables, row counts for churches, mass
 * schedules and liturgical days, and whether a liturgical-day row exists for
 * today's date.
 *
 * Side effects: queries through `pool`, writes to stdout/stderr and always
 * ends `pool`. On failure the error is logged and the exit code is set to 1.
 */
async function checkDatabase() {
  try {
    // Test connection
    console.log('\n1⃣ Testing database connection...');
    await pool.query('SELECT NOW()');
    console.log('✅ Database connection successful');

    // Check tables exist
    console.log('\n2⃣ Checking tables...');
    const tablesResult = await pool.query(`
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public'
ORDER BY table_name
`);
    console.log(`✅ Found ${tablesResult.rows.length} tables:`, tablesResult.rows.map(r => r.table_name).join(', '));

    // Check churches
    console.log('\n3⃣ Checking churches...');
    const churchCount = await pool.query('SELECT COUNT(*) FROM "churches"');
    console.log(`📊 Churches: ${churchCount.rows[0].count}`);
    // The count is parsed from text here, so use an explicit radix
    if (Number.parseInt(churchCount.rows[0].count, 10) > 0) {
      const sampleChurch = await pool.query('SELECT id, name, city, state, latitude, longitude FROM "churches" LIMIT 1');
      console.log('📍 Sample church:', sampleChurch.rows[0]);
    } else {
      console.log('⚠️ No churches found in database!');
    }

    // Check mass schedules
    console.log('\n4⃣ Checking mass schedules...');
    const massCount = await pool.query('SELECT COUNT(*) FROM "mass_schedules"');
    console.log(`📊 Mass schedules: ${massCount.rows[0].count}`);

    // Check liturgical days
    console.log('\n5⃣ Checking liturgical days...');
    const liturgicalCount = await pool.query('SELECT COUNT(*) FROM "liturgical_days"');
    console.log(`📊 Liturgical days: ${liturgicalCount.rows[0].count}`);

    // Check today's liturgical data. toISOString() yields the UTC calendar date.
    // NOTE(review): near midnight this may differ from the local date — confirm
    // which one the liturgical_days.date column is expected to hold.
    const today = new Date().toISOString().split('T')[0];
    const todayData = await pool.query(
      'SELECT * FROM "liturgical_days" WHERE date = $1',
      [today]
    );
    if (todayData.rows.length > 0) {
      console.log(`✅ Today's liturgical data exists:`, todayData.rows[0].season);
    } else {
      console.log(`⚠️ No liturgical data for today (${today})`);
    }
    console.log('\n✨ Database check complete!\n');
  } catch (error) {
    console.error('❌ Error:', error);
    // process.exit() here would terminate before pool.end() in finally runs
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
checkDatabase();

View File

@@ -0,0 +1,164 @@
import { config } from 'dotenv';
import { PrismaClient } from '@prisma/client';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
// Environment files are read in this order: .env.local first, then .env
for (const envFile of ['.env.local', '.env']) {
  config({ path: envFile });
}

const connectionString = process.env.DATABASE_URL;
if (connectionString === undefined || connectionString === '') {
  throw new Error('DATABASE_URL environment variable is not set');
}

// Prisma talks to Postgres through a node-postgres pool wrapped in the pg adapter.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Prints the state of mass-schedule scraping: churches per data source,
 * overall scraping coverage, the ten most recently scraped churches, and how
 * many churches have a website or schedule URL but were never scraped.
 *
 * Side effects: queries through `prisma`, writes to stdout/stderr and always
 * disconnects `prisma` and ends `pool`. On failure the exit code is set to 1.
 */
async function checkScraperStatus() {
  const rule = '═══════════════════════════════════════════════════════════════';
  // A church is scrapable when it has a website or a dedicated schedule URL
  const hasScrapableUrl = [
    { website: { not: null } },
    { massScheduleUrl: { not: null } },
  ];

  try {
    console.log('Checking mass schedule scraper status...\n');

    // Overall church stats
    const totalChurches = await prisma.church.count();
    // One-decimal share of all churches; an empty table yields 0.0 instead of NaN
    const shareOfTotal = (part: number): string =>
      (totalChurches === 0 ? 0 : (part / totalChurches) * 100).toFixed(1);

    const churchesWithWebsites = await prisma.church.count({
      where: { OR: hasScrapableUrl },
    });
    const churchesScraped = await prisma.church.count({
      where: { lastScrapedAt: { not: null } },
    });

    // Mass schedule stats
    const totalMassSchedules = await prisma.massSchedule.count();
    const churchesWithSchedules = await prisma.church.count({
      where: { massSchedules: { some: {} } },
    });

    // Recently scraped (last 7 days)
    const weekAgo = new Date();
    weekAgo.setDate(weekAgo.getDate() - 7);
    const recentlyScraped = await prisma.church.count({
      where: { lastScrapedAt: { gte: weekAgo } },
    });

    // Church counts per data source
    const bySource = await prisma.church.groupBy({
      by: ['source'],
      _count: { id: true },
    });

    console.log(rule);
    console.log('CHURCH DATA SOURCES');
    console.log(rule);
    for (const source of bySource) {
      const percent = shareOfTotal(source._count.id);
      console.log(`${source.source.padEnd(12)} | ${String(source._count.id).padStart(7)} churches (${percent}%)`);
    }
    console.log('');

    console.log(rule);
    console.log('MASS SCHEDULE SCRAPING STATUS');
    console.log(rule);
    console.log(`Total churches: ${totalChurches.toLocaleString()}`);
    console.log(`Churches with websites: ${churchesWithWebsites.toLocaleString()} (${shareOfTotal(churchesWithWebsites)}%)`);
    console.log(`Churches ever scraped: ${churchesScraped.toLocaleString()} (${shareOfTotal(churchesScraped)}%)`);
    console.log(`Churches with mass schedules: ${churchesWithSchedules.toLocaleString()} (${shareOfTotal(churchesWithSchedules)}%)`);
    console.log(`Total mass schedules: ${totalMassSchedules.toLocaleString()}`);
    console.log('');
    console.log(`Scraped in last 7 days: ${recentlyScraped.toLocaleString()}`);
    console.log('');

    // Average schedules per church (only over churches that have any)
    if (churchesWithSchedules > 0) {
      const avgSchedules = totalMassSchedules / churchesWithSchedules;
      console.log(`Average schedules per church: ${avgSchedules.toFixed(1)} masses/week`);
      console.log('');
    }

    // The ten most recently scraped churches
    const recentSample = await prisma.church.findMany({
      where: { lastScrapedAt: { not: null } },
      select: {
        name: true,
        city: true,
        state: true,
        country: true,
        lastScrapedAt: true,
        website: true,
        source: true,
        _count: { select: { massSchedules: true } },
      },
      orderBy: { lastScrapedAt: 'desc' },
      take: 10,
    });
    console.log(rule);
    console.log('RECENTLY SCRAPED CHURCHES (Last 10)');
    console.log(rule);
    if (recentSample.length === 0) {
      console.log('No churches have been scraped yet.');
    } else {
      recentSample.forEach((church, index) => {
        // Skip empty location parts so the list has no dangling commas
        const location = [church.city, church.state, church.country].filter(Boolean).join(', ');
        console.log(`${index + 1}. ${church.name} (${location})`);
        console.log(` Source: ${church.source}`);
        console.log(` Website: ${church.website || 'None'}`);
        console.log(` Last scraped: ${church.lastScrapedAt?.toLocaleString() || 'Never'}`);
        console.log(` Mass schedules: ${church._count.massSchedules}`);
        console.log('');
      });
    }

    // Churches ready to scrape (have a URL, never scraped)
    const readyToScrape = await prisma.church.count({
      where: { OR: hasScrapableUrl, lastScrapedAt: null },
    });
    console.log(rule);
    console.log('SCRAPING POTENTIAL');
    console.log(rule);
    console.log(`Churches ready to scrape: ${readyToScrape.toLocaleString()}`);
    console.log(` (have website, never scraped)`);
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // process.exit() here would terminate before the finally block runs
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
checkScraperStatus();

View File

@@ -0,0 +1,47 @@
import { Pool } from 'pg';
/**
 * Looks up the columns of `table` in information_schema.columns.
 *
 * @param pool  connection pool to run the query on
 * @param table table name, passed as a bound query parameter
 * @returns the result rows (`column_name`, `data_type`) ordered by ordinal position
 */
async function getColumns(pool: Pool, table: string) {
  const columnQuery =
    `SELECT column_name, data_type FROM information_schema.columns WHERE table_name = $1 ORDER BY ordinal_position`;
  const { rows } = await pool.query(columnQuery, [table]);
  return rows;
}
/**
 * Compares the column lists of four tables between the NAS database and the
 * Neon database and prints, per table, the columns that exist on only one
 * side (or "schemas match").
 *
 * Connection strings:
 * - NAS: `NAS_DATABASE_URL`, falling back to the local-network default.
 * - Neon: `NEON_DATABASE_URL`, falling back to `DATABASE_URL`. There is no
 *   built-in default: the production credential must not live in source
 *   control. The password that was previously hard-coded here should be
 *   treated as leaked and rotated.
 *
 * @throws Error when no Neon connection string is configured
 */
async function run() {
  const nasUrl =
    process.env.NAS_DATABASE_URL ?? 'postgresql://postgres:postgres@192.168.0.145:5434/nearestmass';
  const neonUrl = process.env.NEON_DATABASE_URL ?? process.env.DATABASE_URL;
  if (!neonUrl) {
    throw new Error('Set NEON_DATABASE_URL (or DATABASE_URL) to the Neon connection string');
  }

  const nas = new Pool({ connectionString: nasUrl });
  const neon = new Pool({
    connectionString: neonUrl,
    // NOTE(review): this disables TLS certificate verification — confirm it is
    // still required for the Neon endpoint.
    ssl: { rejectUnauthorized: false },
  });

  try {
    for (const table of ['churches', 'mass_schedules', 'confession_schedules', 'adoration_schedules']) {
      const nasCols = await getColumns(nas, table);
      const neonCols = await getColumns(neon, table);
      const nasNames = new Set(nasCols.map((c) => c.column_name));
      const neonNames = new Set(neonCols.map((c) => c.column_name));
      // Columns present on one side only
      const onlyNas = nasCols.filter((c) => !neonNames.has(c.column_name));
      const onlyNeon = neonCols.filter((c) => !nasNames.has(c.column_name));
      if (onlyNas.length > 0 || onlyNeon.length > 0) {
        console.log(`\n=== ${table} ===`);
        if (onlyNas.length) {
          console.log(' NAS only:');
          for (const c of onlyNas) console.log(` - ${c.column_name} (${c.data_type})`);
        }
        if (onlyNeon.length) {
          console.log(' Neon only:');
          for (const c of onlyNeon) console.log(` - ${c.column_name} (${c.data_type})`);
        }
      } else {
        console.log(`\n=== ${table} === (schemas match)`);
      }
    }
  } finally {
    // Previously skipped when a query threw, leaving both pools open
    await nas.end();
    await neon.end();
  }
}
run().catch((error) => {
  console.error(error);
  // Make the failure visible to the calling shell
  process.exitCode = 1;
});

View File

@@ -0,0 +1,48 @@
import { Pool } from 'pg';
const pool = new Pool({ connectionString: process.env.DATABASE_URL });

/**
 * Prints a database overview using a single pooled client: church totals,
 * website / schedule / Google-enrichment counts, top countries and sources,
 * the latest scrape time, scrape-job status counts and schedules by language.
 *
 * The client is released and the pool ended even when a query fails. Errors
 * propagate to the caller.
 */
async function main() {
  const c = await pool.connect();
  try {
    const total = await c.query('SELECT count(*) FROM "Church"');
    console.log('\n=== DATABASE OVERVIEW ===');
    console.log('Churches total:', Number(total.rows[0].count).toLocaleString());

    const withWebsite = await c.query('SELECT count(*) FROM "Church" WHERE website IS NOT NULL');
    console.log('With website:', Number(withWebsite.rows[0].count).toLocaleString());

    const withSchedules = await c.query('SELECT count(DISTINCT "churchId") FROM "MassSchedule"');
    console.log('With mass schedules:', Number(withSchedules.rows[0].count).toLocaleString());

    const enrichedGoogle = await c.query('SELECT count(*) FROM "Church" WHERE "googlePlaceId" IS NOT NULL');
    console.log('Google Places enriched:', Number(enrichedGoogle.rows[0].count).toLocaleString());

    const totalSchedules = await c.query('SELECT count(*) FROM "MassSchedule"');
    console.log('Total mass schedules:', Number(totalSchedules.rows[0].count).toLocaleString());

    const countries = await c.query('SELECT country, count(*) as cnt FROM "Church" GROUP BY country ORDER BY cnt DESC LIMIT 15');
    console.log('\n=== TOP COUNTRIES ===');
    for (const r of countries.rows) console.log(' ' + (r.country || '(null)') + ':', Number(r.cnt).toLocaleString());

    const sources = await c.query('SELECT source, count(*) as cnt FROM "Church" GROUP BY source ORDER BY cnt DESC LIMIT 10');
    console.log('\n=== CHURCH SOURCES ===');
    for (const r of sources.rows) console.log(' ' + (r.source || '(null)') + ':', Number(r.cnt).toLocaleString());

    const lastScrape = await c.query('SELECT "lastScrapedAt" FROM "Church" WHERE "lastScrapedAt" IS NOT NULL ORDER BY "lastScrapedAt" DESC LIMIT 1');
    console.log('\n=== LAST SCRAPE ===');
    console.log(lastScrape.rows[0]?.lastScrapedAt || 'No scrapes yet');

    const jobs = await c.query('SELECT status, count(*) as cnt FROM "ScrapeJob" GROUP BY status ORDER BY cnt DESC');
    console.log('\n=== JOB STATUS ===');
    for (const r of jobs.rows) console.log(' ' + r.status + ':', Number(r.cnt).toLocaleString());

    const schedulesByLang = await c.query('SELECT language, count(*) as cnt FROM "MassSchedule" GROUP BY language ORDER BY cnt DESC LIMIT 10');
    console.log('\n=== SCHEDULES BY LANGUAGE ===');
    for (const r of schedulesByLang.rows) console.log(' ' + (r.language || '(null)') + ':', Number(r.cnt).toLocaleString());
  } finally {
    // Previously skipped when a query threw, leaving the client checked out
    c.release();
    await pool.end();
  }
}
main().catch(e => { console.error(e.message); process.exit(1); });

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env tsx
/**
* Debug a specific French page to see why scraping failed
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrape one hard-coded French parish URL with GenericScraper (country 'FR')
 * and print diagnostics: the scrape result, a sample of the tag-stripped page
 * text, which French weekday names appear in it, and a sample of time-like
 * tokens.
 *
 * The scraper is closed in `finally` so it is released even when `scrape()`
 * throws.
 */
async function debugPage() {
  const url = 'https://www.chemin-neuf.fr/'; // Last failed church
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('FR');
    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules found: ${result.schedules.length}`);
    if (result.error) console.log(`Error: ${result.error}`);

    if (result.rawHtml) {
      // Reduce the HTML to lower-cased plain text: drop <script>/<style>
      // bodies, strip remaining tags, collapse whitespace.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      console.log('\n=== Page Text Sample (first 2000 chars) ===');
      console.log(text.substring(0, 2000));
      console.log('\n');

      // Check for French day names
      const frenchDays = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'];
      console.log('=== French day names found ===');
      for (const day of frenchDays) {
        if (text.includes(day)) {
          console.log(`✓ Found: ${day}`);
        }
      }

      // Check for time patterns
      console.log('\n=== Time patterns (sample) ===');
      const timeRegex = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:AM|PM|am|pm|Uhr|uur|h)?/g;
      const times = text.match(timeRegex);
      if (times) {
        console.log(`Found ${times.length} time-like patterns:`);
        console.log(times.slice(0, 20).join(', '));
      } else {
        console.log('No time patterns found');
      }
    }
  } finally {
    await scraper.close();
  }
}
debugPage().catch(console.error);

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env tsx
/**
* Debug why German church has duplicate schedules
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Temporarily patch GenericScraper to log sections.
// The replacement prints every section returned by findScheduleSections for
// the tag-stripped page text, then delegates to the original parseSchedules
// and prints the extracted times grouped by weekday before returning the
// original result unchanged.
const originalParse = GenericScraper.prototype['parseSchedules'];
GenericScraper.prototype['parseSchedules'] = function(html: string) {
  const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];

  // Lower-cased plain text: script/style bodies removed, tags stripped,
  // whitespace collapsed.
  const withoutScripts = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
  const withoutStyles = withoutScripts.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, ' ');
  const text = withoutTags.replace(/\s+/g, ' ').toLowerCase();

  // Call findScheduleSections and log result
  const sections = this['findScheduleSections'](text);
  console.log('\n=== Sections found ===\n');
  let sectionNumber = 0;
  for (const section of sections) {
    sectionNumber += 1;
    console.log(`Section ${sectionNumber}: ${dayNames[section.day]} (day ${section.day})`);
    console.log(` Text preview: "${section.text.substring(0, 100)}..."`);
  }
  console.log(`\nTotal sections: ${sections.length}\n`);

  // Continue with normal processing
  const result = originalParse.call(this, html);

  console.log(`\n=== Extracted times per section ===\n`);
  const schedsByDay: Record<number, typeof result> = {};
  for (const sched of result) {
    const bucket = schedsByDay[sched.dayOfWeek];
    if (bucket) {
      bucket.push(sched);
    } else {
      schedsByDay[sched.dayOfWeek] = [sched];
    }
  }

  dayNames.forEach((dayName, dayIndex) => {
    const scheds = schedsByDay[dayIndex];
    if (scheds) {
      console.log(`${dayName}: ${scheds.map(s => s.time).join(', ')}`);
    }
  });

  return result;
};
/**
 * Scrape one hard-coded German parish URL with GenericScraper (country 'DE')
 * and print the success flag and schedule count.
 *
 * The scraper is closed in `finally` so it is released even when `scrape()`
 * throws.
 */
async function testGerman() {
  const url = 'https://www.alterpeter.de/';
  console.log(`Testing: ${url}`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape(url);

    console.log(`\n=== Final Result ===`);
    console.log(`Success: ${result.success}`);
    console.log(`Total schedules: ${result.schedules.length}`);
  } finally {
    await scraper.close();
  }
}
testGerman().catch(console.error);

View File

@@ -0,0 +1,44 @@
import { chromium } from 'playwright';
/**
 * Load a masstimes.org search page in headless Chromium and dump debugging
 * artefacts: a full-page screenshot at /tmp/masstimes-debug.png, the first
 * 5000 characters of the rendered HTML, and up to 50 text-node snippets
 * (11-99 characters long) found under <body>.
 *
 * The browser is closed in `finally`; otherwise a navigation timeout would
 * leave the Chromium process running.
 */
async function main() {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    const url = 'https://masstimes.org/search?lat=32.7765&lng=-79.9311&type=parish';
    console.log('Loading:', url);
    await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });

    // Wait for Angular to render
    await page.waitForTimeout(5000);

    // Take screenshot
    await page.screenshot({ path: '/tmp/masstimes-debug.png', fullPage: true });
    console.log('Screenshot saved to /tmp/masstimes-debug.png');

    // Get page HTML
    const html = await page.content();
    console.log('\n--- PAGE HTML (first 5000 chars) ---\n');
    console.log(html.substring(0, 5000));

    // Try to find any visible text that looks like church names
    const visibleText = await page.evaluate(() => {
      const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
      const texts: string[] = [];
      let node;
      while ((node = walker.nextNode())) {
        const text = node.textContent?.trim();
        if (text && text.length > 10 && text.length < 100) {
          texts.push(text);
        }
      }
      return texts.slice(0, 50);
    });

    console.log('\n--- VISIBLE TEXT SNIPPETS ---\n');
    visibleText.forEach((t, i) => console.log(`${i + 1}. ${t}`));
  } finally {
    await browser.close();
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,74 @@
#!/usr/bin/env tsx
/**
* Deep dive into Paróquia da Paz parsing bug
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrape the Paróquia da Paz homepage with GenericScraper (country 'BR') and
 * print diagnostics for the parsing bug: context around the first occurrence
 * of each Portuguese weekday name, all "h"-format times (e.g. 16h, 18h30),
 * and the text following the first occurrence of each schedule keyword.
 *
 * The scraper is closed in `finally` so it is released even when `scrape()`
 * throws.
 */
async function debugPaz() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('BR');
    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      // Lower-cased plain text: script/style bodies removed, tags stripped,
      // whitespace collapsed.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find where days appear
      console.log('=== Finding day + time patterns ===\n');
      const days = ['domingo', 'segunda', 'terça', 'terca', 'quarta', 'quinta', 'sexta', 'sábado', 'sabado'];
      for (const day of days) {
        const dayIndex = text.indexOf(day);
        if (dayIndex !== -1) {
          // Show context around the day (100 chars before and 200 after)
          const before = Math.max(0, dayIndex - 100);
          const after = Math.min(text.length, dayIndex + 200);
          const snippet = text.substring(before, after);
          console.log(`${day.toUpperCase()}:`);
          console.log(` Position: ${dayIndex}`);
          console.log(` Context: ...${snippet}...`);
          console.log('');
        }
      }

      // Check for "h" time format specifically
      console.log('\n=== Checking "h" time format ===');
      const hTimeRegex = /(\d{1,2})h(\d{2})?/g;
      const hTimes = text.match(hTimeRegex);
      if (hTimes) {
        console.log(`Found ${hTimes.length} "h" format times:`);
        console.log(hTimes.slice(0, 30).join(', '));
      }

      // Look for schedule structure
      console.log('\n=== Looking for schedule structure ===');
      const scheduleKeywords = ['horário', 'horario', 'missa', 'missas', 'santa missa'];
      for (const keyword of scheduleKeywords) {
        const index = text.indexOf(keyword);
        if (index !== -1) {
          const snippet = text.substring(index, Math.min(text.length, index + 500));
          console.log(`\nFound "${keyword}" at position ${index}:`);
          console.log(snippet.substring(0, 300));
        }
      }
    }
  } finally {
    await scraper.close();
  }
}
debugPaz().catch(console.error);

View File

@@ -0,0 +1,150 @@
#!/usr/bin/env tsx
/**
* Debug the 5 parsing bugs identified in top 5 test
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// The churches with parsing bugs.
// Each entry drives one iteration of debugBugs():
//   name       - label printed in the section header
//   country    - country code used both to filter the database lookup and to
//                configure the scraper via setCountry()
//   searchTerm - substring matched against the church name in the database
const BUG_CHURCHES = [
{ name: 'St. Marien', country: 'DE', searchTerm: 'St. Marien' },
{ name: 'Santuario de Manalagua', country: 'ES', searchTerm: 'Santuario de Manalagua' },
{ name: 'Kościół pw. Najświętszego Serca', country: 'PL', searchTerm: 'Najświętszego Serca Pana Jez' },
{ name: 'Paróquia de Nossa Senhora do Desterro', country: 'BR', searchTerm: 'Nossa Senhora do Desterro' },
{ name: 'Paróquia da Paz', country: 'BR', searchTerm: 'Paróquia da Paz' },
];
/**
 * For every entry in BUG_CHURCHES, look the church up in the database, scrape
 * its website with GenericScraper and print diagnostics: scrape result, a text
 * sample, the weekday names / time tokens / schedule keywords found for the
 * entry's country, and a few edge-case heuristics.
 *
 * A failure while scraping one church is logged and the loop continues with
 * the next. The scraper, the Prisma client and the pg pool are released in
 * `finally` blocks so they are closed on every path.
 */
async function debugBugs() {
  console.log('Debugging parsing bugs...\n');

  // Per-country lookup tables; built once rather than once per church.
  const dayPatterns: Record<string, string[]> = {
    DE: ['sonntag', 'montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag'],
    ES: ['domingo', 'lunes', 'martes', 'miércoles', 'miercoles', 'jueves', 'viernes', 'sábado', 'sabado'],
    PL: ['niedziela', 'poniedziałek', 'poniedzialek', 'wtorek', 'środa', 'sroda', 'czwartek', 'piątek', 'piatek', 'sobota'],
    BR: ['domingo', 'segunda', 'terça', 'terca', 'quarta', 'quinta', 'sexta', 'sábado', 'sabado'],
  };
  const keywords: Record<string, string[]> = {
    DE: ['gottesdienst', 'messe', 'heilige messe', 'messzeiten'],
    ES: ['misa', 'horario', 'eucaristía', 'eucaristia'],
    PL: ['msza', 'msze', 'nabożeństwo', 'nabozenstwo'],
    BR: ['missa', 'horário', 'horario', 'eucaristia'],
  };

  try {
    const scraper = new GenericScraper();
    await scraper.init();
    try {
      for (const bug of BUG_CHURCHES) {
        console.log('═'.repeat(80));
        console.log(`BUG: ${bug.name} (${bug.country})`);
        console.log('═'.repeat(80));

        const church = await prisma.church.findFirst({
          where: {
            country: bug.country,
            name: { contains: bug.searchTerm },
            website: { not: null },
          },
        });
        if (!church) {
          console.log(`❌ Church not found in database\n`);
          continue;
        }

        console.log(`Church: ${church.name}`);
        console.log(`URL: ${church.website}\n`);
        scraper.setCountry(bug.country);

        try {
          // The query above filters on `website: { not: null }`, so the
          // non-null assertion holds for every row that reaches this point.
          const result = await scraper.scrape(church.website!);
          console.log(`Success: ${result.success}`);
          console.log(`Schedules found: ${result.schedules.length}`);
          if (result.error) console.log(`Error: ${result.error}`);

          if (result.rawHtml) {
            // Lower-cased plain text: script/style bodies removed, tags
            // stripped, whitespace collapsed.
            const text = result.rawHtml
              .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
              .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
              .replace(/<[^>]+>/g, ' ')
              .replace(/\s+/g, ' ')
              .toLowerCase();

            console.log('\n--- Text Sample (first 1000 chars) ---');
            console.log(text.substring(0, 1000));

            // Check for day names
            console.log('\n--- Day Names Found ---');
            const days = dayPatterns[bug.country] || [];
            const foundDays = days.filter((day) => text.includes(day));
            console.log(`Found: ${foundDays.join(', ') || 'none'}`);

            // Check for time patterns
            console.log('\n--- Time Patterns Found ---');
            const timeRegex = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h|uhr)?/gi;
            const times = text.match(timeRegex);
            if (times) {
              const uniqueTimes = [...new Set(times)].slice(0, 20);
              console.log(`Found ${times.length} time patterns (showing first 20 unique):`);
              console.log(uniqueTimes.join(', '));
            } else {
              console.log('No time patterns found');
            }

            // Look for specific mass schedule keywords
            console.log('\n--- Mass Schedule Keywords ---');
            const countryKeywords = keywords[bug.country] || [];
            const foundKeywords = countryKeywords.filter((keyword) => text.includes(keyword));
            console.log(`Found: ${foundKeywords.join(', ') || 'none'}`);

            // Look for specific problematic patterns
            console.log('\n--- Looking for edge cases ---');

            // Check if times and days are separated (not in same section).
            // Compare the earliest position of any day name with the position
            // of the first time token; report YES only when both exist and
            // the time really comes first.
            const firstDayIndex = foundDays.length > 0
              ? Math.min(...foundDays.map((day) => text.indexOf(day)))
              : -1;
            const firstTimeIndex = times ? text.indexOf(times[0]) : -1;
            const hasTimeBeforeDays =
              firstDayIndex !== -1 && firstTimeIndex !== -1 && firstTimeIndex < firstDayIndex;
            console.log(`Times come before days: ${hasTimeBeforeDays ? 'YES (potential issue)' : 'no'}`);

            // Check for table structures. colspan/rowspan are tag attributes,
            // so they must be looked up in the raw HTML: the stripped text no
            // longer contains any markup.
            const rawLower = result.rawHtml.toLowerCase();
            const hasTables =
              rawLower.includes('colspan') ||
              rawLower.includes('rowspan') ||
              (text.match(/\s+\|\s+/g)?.length || 0) > 5;
            console.log(`Likely table format: ${hasTables ? 'YES (may need special handling)' : 'no'}`);

            // Check for multiple languages on same page
            const hasMultiLang = (text.match(/english|español|espanol|portuguese|português|portugues|deutsch|polski/gi)?.length || 0) > 1;
            console.log(`Multiple languages: ${hasMultiLang ? 'YES (may confuse parser)' : 'no'}`);
          }
          console.log('\n');
        } catch (err: unknown) {
          const message = err instanceof Error ? err.message : String(err);
          console.log(`❌ ERROR: ${message}\n`);
        }
      }
    } finally {
      await scraper.close();
    }
  } finally {
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
debugBugs().catch(console.error);

View File

@@ -0,0 +1,98 @@
#!/usr/bin/env tsx
/**
* Debug the full parsing flow with section detection
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
/**
 * Scrape the Paróquia da Paz homepage (country 'BR'), locate the schedule
 * text starting at "segundas, terças", run the per-day section regex for
 * "sábados", "sabados" and "domingos" against a 500-character snippet, and
 * finally print how many parsed schedules landed on each weekday.
 *
 * Returns early (after logging) when no HTML was received or the schedule
 * marker is absent. The scraper is closed in `finally`, which covers the
 * early returns as well as exceptions thrown by `scrape()`.
 */
async function debugFullFlow() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('BR');
    const result = await scraper.scrape(url);
    if (!result.rawHtml) {
      console.log('No HTML received');
      return;
    }

    // Lower-cased plain text: script/style bodies removed, tags stripped,
    // whitespace collapsed.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Find the schedule section
    const scheduleIndex = text.indexOf('segundas, terças');
    if (scheduleIndex === -1) {
      console.log('Schedule text not found!');
      return;
    }

    const snippet = text.substring(scheduleIndex, scheduleIndex + 500);
    console.log('Schedule snippet from actual HTML:');
    console.log(snippet);
    console.log('\n');

    // Now test section matching on actual text.
    // Longest names first so the alternation prefers e.g. a plural form over
    // a shorter name that is its prefix.
    const dayConfigs = getDayNamesForCountry('BR');
    const dayPatterns = buildDayPatterns(dayConfigs);
    const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
    const allDayNamesPattern = sortedDayNames.map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');

    console.log('=== Testing sábados and domingos matches ===\n');
    // Same probe for each spelling: accented, unaccented, then Sunday.
    for (const dayWord of ['sábados', 'sabados', 'domingos']) {
      const dayRegex = new RegExp(
        `(?:^|\\s|[,;:])${dayWord}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
        'i'
      );
      const dayMatch = snippet.match(dayRegex);
      console.log(`${dayWord} match:`, dayMatch ? `Found: "${dayMatch[1].substring(0, 50)}"` : 'Not found');
    }

    console.log('\n=== Final parsed schedules ===\n');
    console.log(`Total: ${result.schedules.length}`);

    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    const dayNames = ['Domingo', 'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado'];
    for (let i = 0; i < 7; i++) {
      if (byDay[i]) {
        console.log(`${dayNames[i]}: ${byDay[i].length} schedules`);
      } else {
        console.log(`${dayNames[i]}: 0 schedules ❌`);
      }
    }
  } finally {
    await scraper.close();
  }
}
debugFullFlow().catch(console.error);

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env tsx
/**
* Debug which sections are being found
*/
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
// Simulate the exact text from the page
const scheduleText = `
horário das missas igreja matriz de santo antônio
segundas, terças, quartas e sextas-feiras: 16h e 18h.
quintas-feiras: 16h e 19h (adoração ao santíssimo 18h).
sábados: 8h, 16h e 18h.
domingos: 8h, 11h, 16h, 18h e 20h.
`.toLowerCase();

console.log('Text to parse:');
console.log(scheduleText);
console.log('');

// Escape regex metacharacters so a day name can be embedded in a pattern.
const escapeForRegex = (raw: string): string => raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

// Alternation of every Brazilian day name, longest first.
const dayPatterns = buildDayPatterns(getDayNamesForCountry('BR'));
const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
const allDayNamesPattern = sortedDayNames.map((d) => escapeForRegex(d)).join('|');

console.log('=== COMMA-SEPARATED GROUP MATCHING ===\n');
// Group 1: a run of two or more day names joined by commas/whitespace and an
// optional conjunction. Group 2: the text up to the next day name or the end.
const dayGroupRegex = new RegExp(
  `((?:${allDayNamesPattern})(?:[,\\s]+(?:e|and|et|und|y)?\\s*(?:${allDayNamesPattern}))+)[:\\s]+([^]*?)(?=(?:${allDayNamesPattern})|$)`,
  'gi'
);
[...scheduleText.matchAll(dayGroupRegex)].forEach((groupMatch, position) => {
  console.log(`Match #${position + 1}:`);
  console.log(` Day group: "${groupMatch[1]}"`);
  console.log(` Time text: "${groupMatch[2]}"`);
  console.log('');
});

console.log('=== INDIVIDUAL DAY MATCHING ===\n');
for (const [dayName, dayIndex] of Object.entries(dayPatterns)) {
  const singleDayRegex = new RegExp(
    `(?:^|\\s|[,;:])${escapeForRegex(dayName)}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );
  const found = scheduleText.match(singleDayRegex);
  if (!found) continue;
  console.log(`Found ${dayName} (day ${dayIndex}):`);
  console.log(` Time text: "${found[1].substring(0, 100)}"`);
}

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env tsx
/**
* Debug Paróquia da Paz with added logging
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
/**
 * Scrape the Paróquia da Paz homepage (country 'BR') and run the
 * comma-separated day-group regex against the tag-stripped page text, printing
 * every match. When nothing matches, fall back to locating the literal
 * "segundas, terças" marker and report which of the ten longest day names
 * occur in the 300 characters that follow it.
 *
 * The scraper is closed in `finally` so it is released even when `scrape()`
 * throws.
 */
async function debugPazWithLogging() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('BR');
    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      // Lower-cased plain text: script/style bodies removed, tags stripped,
      // whitespace collapsed.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Test the regex pattern manually
      console.log('=== Testing comma-separated day grouping regex ===\n');
      const dayConfigs = getDayNamesForCountry('BR');
      const dayPatterns = buildDayPatterns(dayConfigs);
      // Longest names first so the alternation prefers the longer variant.
      const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
      const allDayNamesPattern = sortedDayNames.map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');

      console.log('Day patterns:', Object.keys(dayPatterns).join(', '));
      console.log('');

      // The exact regex from the code
      const dayGroupRegex = new RegExp(
        `((?:${allDayNamesPattern})(?:[,\\s]+(?:e|and|et|und|y)?\\s*(?:${allDayNamesPattern}))+)[:\\s]+([^]*?)(?=(?:${allDayNamesPattern})|$)`,
        'gi'
      );
      console.log('Regex pattern:', dayGroupRegex.source.substring(0, 200) + '...\n');

      let groupMatch;
      let matchCount = 0;
      while ((groupMatch = dayGroupRegex.exec(text)) !== null) {
        matchCount++;
        console.log(`Match #${matchCount}:`);
        console.log(` Full match: "${groupMatch[0].substring(0, 100)}"`);
        console.log(` Day group: "${groupMatch[1]}"`);
        console.log(` Time text: "${groupMatch[2].substring(0, 50)}"`);
        console.log('');
      }

      if (matchCount === 0) {
        console.log('No matches found!\n');

        // Try to find the schedule text manually
        const scheduleIndex = text.indexOf('segundas, terças');
        if (scheduleIndex !== -1) {
          const snippet = text.substring(scheduleIndex, scheduleIndex + 300);
          console.log('Found schedule text at position', scheduleIndex);
          console.log('Snippet:', snippet);
          console.log('');

          // Test if individual day names are matching
          console.log('Testing individual day name matches in snippet:');
          for (const dayName of sortedDayNames.slice(0, 10)) {
            if (snippet.includes(dayName)) {
              console.log(` ✓ Found: ${dayName}`);
            }
          }
        }
      }
    }
  } finally {
    await scraper.close();
  }
}
debugPazWithLogging().catch(console.error);

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env tsx
/**
* Debug Polish church in detail
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
/**
 * Scrape a hard-coded Polish parish URL (country 'PL') and print diagnostics:
 * the 500 characters following the "msze święte" heading (or its unaccented
 * spelling), the space-separated and colon-separated times found there, the
 * Polish day names present, and the parsed schedules grouped by weekday.
 *
 * The scraper is closed in `finally` so it is released even when `scrape()`
 * throws.
 */
async function debugPolish() {
  const url = 'http://parafialubojna.pl';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('PL');
    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules found: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      // Lower-cased plain text: script/style bodies removed, tags stripped,
      // whitespace collapsed.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find the schedule section. indexOf returns -1 (truthy) on a miss and
      // 0 (falsy) for a hit at the very start, so the fallback must test for
      // -1 explicitly rather than use `||`.
      let scheduleIndex = text.indexOf('msze święte');
      if (scheduleIndex === -1) {
        scheduleIndex = text.indexOf('msze swiete');
      }

      if (scheduleIndex !== -1) {
        const snippet = text.substring(scheduleIndex, scheduleIndex + 500);
        console.log('Schedule section:');
        console.log(snippet);
        console.log('\n');

        // Test all time pattern matches
        console.log('=== Testing time pattern matches ===\n');

        // Space separator pattern
        const spacePattern = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;
        const spaceMatches = snippet.match(spacePattern);
        console.log('Space-separated times (8 00, 9 30):');
        console.log(spaceMatches ? spaceMatches.join(', ') : 'none');
        console.log('');

        // Colon pattern
        const colonPattern = /\d{1,2}:\d{2}/g;
        const colonMatches = snippet.match(colonPattern);
        console.log('Colon times (8:00, 9:30):');
        console.log(colonMatches ? colonMatches.join(', ') : 'none');
        console.log('');

        // Polish day names
        console.log('=== Polish day names in snippet ===\n');
        const dayConfigs = getDayNamesForCountry('PL');
        const dayPatterns = buildDayPatterns(dayConfigs);
        for (const [dayName, dayNum] of Object.entries(dayPatterns)) {
          if (snippet.includes(dayName)) {
            console.log(`Found: ${dayName} (day ${dayNum})`);
          }
        }
      }
    }

    console.log('\n=== Parsed schedules ===\n');
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    const dayNames = ['Niedziela', 'Poniedziałek', 'Wtorek', 'Środa', 'Czwartek', 'Piątek', 'Sobota'];
    for (let i = 0; i < 7; i++) {
      if (byDay[i]) {
        console.log(`${dayNames[i]}: ${byDay[i].map(s => s.time).join(', ')}`);
      }
    }
  } finally {
    await scraper.close();
  }
}
debugPolish().catch(console.error);

View File

@@ -0,0 +1,79 @@
#!/usr/bin/env tsx
/**
* Debug why Sunday and Monday aren't parsing for Polish church
*/
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
// Exact schedule text from website
const text = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();

console.log('Text to parse:');
console.log(text);
console.log('\n');

// Escape regex metacharacters so a day name can be embedded in a pattern.
const escapeForRegex = (raw: string): string => raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

// Alternation of every Polish day name, longest first.
const dayPatterns = buildDayPatterns(getDayNamesForCountry('PL'));
const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
const allDayNamesPattern = sortedDayNames.map((d) => escapeForRegex(d)).join('|');

// Current regex pattern: the day name, then either up to 50 non-colon
// characters ending in a colon or plain whitespace, then the section text up
// to the next day name or the end of the input.
function buildSectionRegex(dayName: string): RegExp {
  return new RegExp(
    `(?:^|\\s|[,;:])${escapeForRegex(dayName)}(?:(?:[^:]{1,50})?:|\\s+)([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );
}

// Print whether the section regex matched and which space-separated times
// ("8 00") can be extracted from the captured text.
function reportSectionMatch(found: RegExpMatchArray | null): void {
  if (found === null) {
    console.log(`✗ NOT matched`);
    return;
  }
  console.log(`✓ Matched!`);
  console.log(` Full match: "${found[0].substring(0, 100)}"`);
  console.log(` Captured text: "${found[1].substring(0, 100)}"`);
  console.log('');
  // Check if times can be extracted
  const times = found[1].match(/\b(\d{1,2})\s+(\d{2})(?!\d)/g);
  console.log(` Times found: ${times ? times.join(', ') : 'none'}`);
}

console.log('=== Testing niedziela (Sunday) ===\n');
reportSectionMatch(text.match(buildSectionRegex('niedziela')));

console.log('\n=== Testing poniedziałek (Monday) ===\n');
reportSectionMatch(text.match(buildSectionRegex('poniedziałek')));

console.log('\n=== Analyzing why niedziela might fail ===\n');
// The issue might be "niedziela i uroczystości:" - the phrase is long
// Check if the lookahead is hitting "uroczystości" before getting to the times
const niedzielaIndex = text.indexOf('niedziela');
const laterDayPositions = sortedDayNames
  .filter((d) => d !== 'niedziela')
  .map((d) => text.indexOf(d, niedzielaIndex))
  .filter((position) => position > 0);
const nextDayIndex = Math.min(...laterDayPositions);

console.log(`niedziela position: ${niedzielaIndex}`);
console.log(`Next day name position: ${nextDayIndex}`);
console.log(`Text between: "${text.substring(niedzielaIndex, nextDayIndex)}"`);

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env tsx
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrape https://www.alterpeter.de/ (country 'DE') and print every span of the
 * tag-stripped text that runs from "montag" through "bis" to "donnerstag",
 * together with 150 characters of context before and 250 characters from the
 * start of the match.
 *
 * The scraper is closed in `finally` so it is released even when `scrape()`
 * throws.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (result.rawHtml) {
      // Lower-cased plain text: script/style bodies removed, tags stripped,
      // whitespace collapsed.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find "montag bis donnerstag" pattern
      const pattern = /montag[^]*?bis[^]*?donnerstag/gi;
      const matches = [...text.matchAll(pattern)];
      console.log(`Found ${matches.length} instances of "montag bis donnerstag":\n`);

      for (let i = 0; i < matches.length; i++) {
        const match = matches[i];
        const matchIndex = match.index ?? 0;
        const contextBefore = text.substring(Math.max(0, matchIndex - 150), matchIndex);
        const contextAfter = text.substring(matchIndex, Math.min(text.length, matchIndex + 250));

        console.log(`=== Instance ${i + 1} ===`);
        console.log(`Position: ${matchIndex}`);
        console.log(`\nContext BEFORE (150 chars):`);
        console.log(`"${contextBefore}"`);
        console.log(`\nContext AFTER (250 chars):`);
        console.log(`"${contextAfter}"`);
        console.log('');
      }
    }
  } finally {
    await scraper.close();
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env tsx
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrape https://www.alterpeter.de/ (country 'DE') and print every occurrence
 * of a standalone "00 uhr" token in the tag-stripped text, with 50 characters
 * of context before and 100 characters from the start of the match, followed
 * by the total count.
 *
 * The scraper is closed in `finally` so it is released even when `scrape()`
 * throws.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (result.rawHtml) {
      // Lower-cased plain text: script/style bodies removed, tags stripped,
      // whitespace collapsed.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find all instances of "00 uhr" pattern
      let count = 0;
      const pattern = /\b00\s*uhr/g;
      let match;
      console.log('Looking for "00 uhr" patterns:\n');
      while ((match = pattern.exec(text)) !== null) {
        count++;
        const matchIndex = match.index;
        const contextBefore = text.substring(Math.max(0, matchIndex - 50), matchIndex);
        const contextAfter = text.substring(matchIndex, Math.min(text.length, matchIndex + 100));

        console.log(`=== Occurrence ${count} at position ${matchIndex} ===`);
        console.log(`BEFORE: "...${contextBefore}"`);
        console.log(`MATCH + AFTER: "${contextAfter}..."`);
        console.log('');
      }
      console.log(`Total "00 uhr" occurrences: ${count}`);
    }
  } finally {
    await scraper.close();
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env tsx
import { config } from 'dotenv';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import fs from 'fs/promises';
config({ path: '.env.local' });
/**
 * Export every church whose country is 'DE', together with its mass,
 * confession and adoration schedules, from the database at DATABASE_URL into
 * `export-DE.json` (path relative to the current working directory).
 *
 * The Prisma client and the pg pool are released in `finally` so a failed
 * query or file write does not leave connections open.
 */
async function main() {
  console.log('📦 Exporting Germany from Neon...');

  const pool = new Pool({ connectionString: process.env.DATABASE_URL });
  const adapter = new PrismaPg(pool);
  const prisma = new PrismaClient({ adapter });

  try {
    await prisma.$connect();

    const churches = await prisma.churches.findMany({
      where: { country: 'DE' },
      include: {
        massSchedules: true,
        confessionSchedules: true,
        adorationSchedules: true,
      }
    });
    console.log(`Found ${churches.length} churches in Germany`);

    await fs.writeFile('export-DE.json', JSON.stringify(churches, null, 2));
    console.log(`✅ Exported to export-DE.json`);
  } finally {
    // Disconnect first, but end the pool even if disconnecting fails.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,60 @@
#!/usr/bin/env tsx
/**
* Export churches from NAS database to JSON
* Run this ON THE NAS (uses DATABASE_URL from .env)
*/
import { PrismaClient } from '@prisma/client';
import fs from 'fs/promises';
/**
 * Export all churches of one country, with their mass, confession and
 * adoration schedules, to `export-<country>.json` in the working directory.
 *
 * The country code is read from the first CLI argument and defaults to 'PL'.
 * On any failure the error is logged, the client is disconnected and the
 * process exits with status 1.
 */
async function main() {
  const country = process.argv[2] || 'PL';
  // Hide the password portion of the connection string before echoing it.
  const maskedUrl = process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@');

  console.log(`📦 Exporting ${country} data from database...`);
  console.log(`DATABASE_URL: ${maskedUrl}`);

  const prisma = new PrismaClient();
  try {
    await prisma.$connect();
    console.log('✅ Connected to database');

    // Export churches with all schedules
    const churches = await prisma.churches.findMany({
      where: { country },
      include: {
        massSchedules: true,
        confessionSchedules: true,
        adorationSchedules: true,
      }
    });
    console.log(`Found ${churches.length} churches in ${country}`);

    // Count schedules
    let massSchedules = 0;
    let confessionSchedules = 0;
    let adorationSchedules = 0;
    for (const church of churches) {
      massSchedules += church.massSchedules?.length || 0;
      confessionSchedules += church.confessionSchedules?.length || 0;
      adorationSchedules += church.adorationSchedules?.length || 0;
    }

    // Save to file
    const exportFile = `export-${country}.json`;
    const payload = JSON.stringify(churches, null, 2);
    await fs.writeFile(exportFile, payload);

    const report = [
      `\n✅ Exported to ${exportFile}`,
      ` - ${churches.length} churches`,
      ` - ${massSchedules} mass schedules`,
      ` - ${confessionSchedules} confession schedules`,
      ` - ${adorationSchedules} adoration schedules`,
      `\nDownload with:`,
      ` scp albert@192.168.0.145:/volume1/docker/nearestmass/${exportFile} .`,
    ];
    for (const line of report) {
      console.log(line);
    }

    await prisma.$disconnect();
  } catch (error) {
    console.error('❌ Export failed:', error);
    await prisma.$disconnect();
    process.exit(1);
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,230 @@
#!/usr/bin/env tsx
/**
* Export churches from local NAS database and import to Neon
*/
import { PrismaClient } from '@prisma/client';
import fs from 'fs/promises';
import path from 'path';
/** Row counts for one country export, as returned by exportFromNAS(). */
interface ExportStats {
/** Number of churches exported. */
churches: number;
/** Total mass schedules across the exported churches. */
massSchedules: number;
/** Total confession schedules across the exported churches. */
confessionSchedules: number;
/** Total adoration schedules across the exported churches. */
adorationSchedules: number;
}
/**
 * Exports every church of `country`, including its mass, confession and
 * adoration schedules, from the NAS database into `export-<country>.json`
 * in the current working directory.
 *
 * The Prisma client is constructed without options, so the NAS connection
 * string is injected by temporarily overriding `process.env.DATABASE_URL`.
 * The previous value (or its absence) is always restored before returning or
 * throwing, so a client created afterwards does not silently point at the NAS.
 *
 * @param country - Country code used as the `country` filter of the query.
 * @returns Counts of the exported churches and schedule rows.
 * @throws Re-throws any connection, query or file-system error after logging it.
 */
async function exportFromNAS(country: string): Promise<ExportStats> {
  console.log(`📦 Exporting ${country} data from NAS...`);

  const originalUrl = process.env.DATABASE_URL;
  // NAS_DATABASE_URL lets the target be overridden without editing the script;
  // `||` (not `??`) so an empty variable also falls back to the built-in default.
  process.env.DATABASE_URL =
    process.env.NAS_DATABASE_URL || 'postgresql://postgres:postgres@192.168.0.145:5432/nearestmass';

  const nasPrisma = new PrismaClient();
  try {
    await nasPrisma.$connect();
    console.log('✅ Connected to NAS database');

    // Export churches with all schedules
    const churches = await nasPrisma.churches.findMany({
      where: { country },
      include: {
        massSchedules: true,
        confessionSchedules: true,
        adorationSchedules: true,
      },
    });
    console.log(`Found ${churches.length} churches in ${country}`);

    // Count schedules
    const stats: ExportStats = {
      churches: churches.length,
      massSchedules: churches.reduce((sum, c) => sum + (c.massSchedules?.length || 0), 0),
      confessionSchedules: churches.reduce((sum, c) => sum + (c.confessionSchedules?.length || 0), 0),
      adorationSchedules: churches.reduce((sum, c) => sum + (c.adorationSchedules?.length || 0), 0),
    };

    // Save to file
    const exportFile = path.join(process.cwd(), `export-${country}.json`);
    await fs.writeFile(exportFile, JSON.stringify(churches, null, 2));

    console.log(`✅ Exported to ${exportFile}`);
    console.log(` - ${stats.churches} churches`);
    console.log(` - ${stats.massSchedules} mass schedules`);
    console.log(` - ${stats.confessionSchedules} confession schedules`);
    console.log(` - ${stats.adorationSchedules} adoration schedules`);

    return stats;
  } catch (error) {
    console.error('❌ Export failed:', error);
    throw error;
  } finally {
    try {
      await nasPrisma.$disconnect();
    } finally {
      // Restore the environment even when it was originally unset; leaving the
      // NAS URL behind would redirect later clients to the NAS.
      if (originalUrl === undefined) {
        delete process.env.DATABASE_URL;
      } else {
        process.env.DATABASE_URL = originalUrl;
      }
    }
  }
}
/**
 * Imports `export-<country>.json` (read from the current working directory)
 * into the database behind `DATABASE_URL`.
 *
 * Each church is upserted on its `latitude_longitude` key. When the church
 * already existed, its mass, confession and adoration schedules are deleted
 * before the exported ones are inserted, so repeated imports do not pile up
 * duplicate schedule rows. A failure on one church is logged and counted, and
 * the loop continues with the next church.
 *
 * @param country - Country code selecting the export file.
 * @param dryRun - When true (the default) nothing is written; churches are only counted.
 * @throws If the export file cannot be read or parsed, is not a JSON array,
 *         or the database connection fails.
 */
async function importToNeon(country: string, dryRun: boolean = true): Promise<void> {
  console.log(`\n📤 Importing ${country} data to Neon...`);
  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No data will be written');
  }

  // Read export file
  const exportFile = path.join(process.cwd(), `export-${country}.json`);
  const data = JSON.parse(await fs.readFile(exportFile, 'utf-8'));
  if (!Array.isArray(data)) {
    throw new Error(`Export file ${exportFile} does not contain a JSON array of churches`);
  }
  console.log(`Loaded ${data.length} churches from export file`);

  // Connect to Neon
  const neonPrisma = new PrismaClient();
  try {
    await neonPrisma.$connect();
    console.log('✅ Connected to Neon database');

    let inserted = 0;
    let updated = 0;
    let errors = 0;

    for (const church of data) {
      try {
        const massSchedules = church.massSchedules || [];
        const confessionSchedules = church.confessionSchedules || [];
        const adorationSchedules = church.adorationSchedules || [];

        // Remove relations and ids so only scalar church columns are written.
        delete church.massSchedules;
        delete church.confessionSchedules;
        delete church.adorationSchedules;
        delete church.id;

        if (dryRun) {
          // Dry run - just count
          inserted++;
        } else {
          // Look the church up BEFORE the upsert: that is the only reliable way
          // to tell an insert from an update afterwards.
          const existing = await neonPrisma.churches.findFirst({
            where: {
              latitude: church.latitude,
              longitude: church.longitude,
            },
          });

          const result = await neonPrisma.churches.upsert({
            where: {
              latitude_longitude: {
                latitude: church.latitude,
                longitude: church.longitude,
              },
            },
            create: church,
            update: church,
          });

          if (existing) {
            updated++;
            // Replace, rather than append to, the schedules already stored.
            await neonPrisma.massSchedules.deleteMany({ where: { churchId: result.id } });
            await neonPrisma.confessionSchedules.deleteMany({ where: { churchId: result.id } });
            await neonPrisma.adorationSchedules.deleteMany({ where: { churchId: result.id } });
          } else {
            inserted++;
          }

          // Insert schedules, re-pointing them at the target church id.
          for (const schedule of massSchedules) {
            delete schedule.id;
            await neonPrisma.massSchedules.create({
              data: { ...schedule, churchId: result.id },
            });
          }
          for (const schedule of confessionSchedules) {
            delete schedule.id;
            await neonPrisma.confessionSchedules.create({
              data: { ...schedule, churchId: result.id },
            });
          }
          for (const schedule of adorationSchedules) {
            delete schedule.id;
            await neonPrisma.adorationSchedules.create({
              data: { ...schedule, churchId: result.id },
            });
          }
        }

        // Report on the combined count; testing `inserted` alone would log on
        // every updated church while `inserted` is still 0.
        const processed = inserted + updated;
        if (processed % 100 === 0) {
          console.log(`Progress: ${processed} churches processed...`);
        }
      } catch (error) {
        errors++;
        console.error(`Error importing church ${church.name}:`, error instanceof Error ? error.message : error);
      }
    }

    console.log('\n✅ Import complete!');
    console.log(` - ${inserted} churches inserted`);
    console.log(` - ${updated} churches updated`);
    console.log(` - ${errors} errors`);
  } catch (error) {
    console.error('❌ Import failed:', error);
    throw error;
  } finally {
    await neonPrisma.$disconnect();
  }
}
/**
 * CLI entry point: exports one country from the NAS, then imports the
 * resulting file into Neon.
 *
 * argv[2] is the country code (default `PL`); argv[3] is the mode, where
 * anything other than `dry-run` (the default) performs a real import.
 * Any failure is logged and the process exits with status 1.
 */
async function main() {
  const [, , countryArg, modeArg] = process.argv;
  const country = countryArg || 'PL';
  const dryRun = (modeArg || 'dry-run') === 'dry-run';

  console.log('🌍 Export/Import to Neon');
  console.log('========================\n');

  try {
    // Step 1: pull the data out of the NAS database.
    await exportFromNAS(country);

    // Step 2: push (or merely count, in dry-run mode) into Neon.
    await importToNeon(country, dryRun);

    if (!dryRun) {
      console.log('\n🎉 Data successfully uploaded to Neon!');
      return;
    }
    console.log('\n💡 This was a DRY RUN. To actually import to Neon, run:');
    console.log(` npx tsx scripts/export-import-to-neon.ts ${country} real-import`);
  } catch (error) {
    console.error('❌ Process failed:', error);
    process.exit(1);
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,41 @@
#!/usr/bin/env tsx
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Debug helper: scrapes https://www.alterpeter.de/ with the German country
 * setting and prints every occurrence of "donnerstag" (Thursday) in the
 * tag-stripped, lower-cased page text, with 100 characters of context before
 * and 200 characters from the match onwards.
 *
 * The scraper is closed in a `finally` block so it is released even when the
 * scrape itself throws.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape('https://www.alterpeter.de/');
    if (!result.rawHtml) {
      return;
    }

    // Drop scripts, styles and tags, then collapse whitespace.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Find all instances of "donnerstag" (Thursday)
    let count = 0;
    for (
      let idx = text.indexOf('donnerstag');
      idx !== -1;
      idx = text.indexOf('donnerstag', idx + 1)
    ) {
      count++;
      const contextBefore = text.substring(Math.max(0, idx - 100), idx);
      const contextAfter = text.substring(idx, idx + 200);
      console.log(`=== Donnerstag occurrence ${count} at position ${idx} ===`);
      console.log(`BEFORE: "...${contextBefore}"`);
      console.log(`AFTER: "${contextAfter}..."`);
      console.log('');
    }
    console.log(`Total "donnerstag" occurrences: ${count}`);
  } finally {
    await scraper.close();
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,42 @@
#!/usr/bin/env tsx
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Debug helper: scrapes https://www.alterpeter.de/ and prints the text around
 * the literal "9.00 12.00". When that is absent it falls back to the first of
 * several single-time spellings that occurs in the page text.
 *
 * The scraper is closed in a `finally` block so it is released even when the
 * scrape itself throws.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('DE');
    const result = await scraper.scrape('https://www.alterpeter.de/');
    if (!result.rawHtml) {
      return;
    }

    // Drop scripts, styles and tags, then collapse whitespace.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    const idx = text.indexOf('9.00 12.00');
    if (idx !== -1) {
      console.log('Context around "9.00 12.00":');
      console.log(text.substring(Math.max(0, idx - 150), idx + 200));
      return;
    }

    console.log('Pattern "9.00 12.00" not found');
    // Try alternative patterns; only the first one present is reported.
    for (const pattern of ['9.00', '9:00', '09:00', '09.00']) {
      const at = text.indexOf(pattern);
      if (at === -1) {
        continue;
      }
      console.log(`\nFound "${pattern}" at position ${at}:`);
      console.log(text.substring(Math.max(0, at - 100), at + 150));
      break;
    }
  } finally {
    await scraper.close();
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env tsx
/**
* Identify which churches are flagged as "parsing bugs" in top 5 test
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// Countries sampled by identifyBugs. `code` is used both as the database
// `country` filter and as the argument to scraper.setCountry; `name` is the
// label stored with each flagged church and shown in the report.
const COUNTRIES = [
{ code: 'FR', name: 'France' },
{ code: 'DE', name: 'Germany' },
{ code: 'ES', name: 'Spain' },
{ code: 'PL', name: 'Poland' },
{ code: 'BR', name: 'Brazil' },
];
/**
 * Heuristic check for clock-time-like text: a 1-2 digit hour followed by
 * `:mm` / `.mm`, by `h` (optionally with minutes), or by `am` / `pm` / `uhr`.
 *
 * A separator or unit is mandatory; a pattern with an optional separator
 * would match any bare digit and therefore almost every page.
 */
function looksLikeTime(text: string): boolean {
  return /\b\d{1,2}\s*(?:[:.]\s*\d{2}|h\s*\d{1,2}|(?:h|am|pm|uhr)(?![a-z]))/i.test(text);
}

/**
 * For each country in COUNTRIES, scrapes the first 10 OSM churches that have
 * a website (oldest first) and lists those where the scraper reported failure
 * even though the page text contains both a weekday name and a time-like
 * pattern - i.e. candidates for parser bugs rather than pages without data.
 *
 * Scrape errors are logged and skipped. The scraper, Prisma client and pg
 * pool are released in `finally` blocks.
 */
async function identifyBugs() {
  console.log('Identifying "parsing bugs" from top 5 test...\n');

  // Weekday names in English, French, German, Spanish, Polish and Portuguese.
  const dayPattern = /\b(sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|domingos|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|sábados|sabados|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|segundas|terça|terca|terças|tercas|quarta|quartas|quinta|quintas|sexta|sextas)\b/i;

  const bugs: Array<{
    country: string;
    church: string;
    url: string;
    hasDays: boolean;
    hasTimes: boolean;
  }> = [];

  try {
    const scraper = new GenericScraper();
    await scraper.init();
    try {
      for (const country of COUNTRIES) {
        const churches = await prisma.church.findMany({
          where: {
            country: country.code,
            website: { not: null },
            source: 'osm',
          },
          take: 10,
          orderBy: { createdAt: 'asc' },
        });

        scraper.setCountry(country.code);

        for (const church of churches) {
          try {
            const result = await scraper.scrape(church.website!);
            if (result.success || !result.rawHtml) {
              continue;
            }

            // Drop scripts, styles and tags, then collapse whitespace.
            const text = result.rawHtml
              .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
              .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
              .replace(/<[^>]+>/g, ' ')
              .replace(/\s+/g, ' ')
              .toLowerCase();

            // Check for day names and times
            if (dayPattern.test(text) && looksLikeTime(text)) {
              bugs.push({
                country: country.name,
                church: church.name,
                url: church.website!,
                hasDays: true,
                hasTimes: true,
              });
            }
          } catch (err: unknown) {
            // Best effort: one unreachable site must not abort the survey,
            // but say so instead of silently dropping it.
            const reason = err instanceof Error ? err.message : String(err);
            console.warn(`Skipping ${church.website}: ${reason}`);
          }
        }
      }
    } finally {
      await scraper.close();
    }

    console.log(`\n${'='.repeat(80)}`);
    console.log(`FOUND ${bugs.length} POTENTIAL PARSING BUGS\n`);
    bugs.forEach((bug, i) => {
      console.log(`${i + 1}. ${bug.church} (${bug.country})`);
      console.log(` URL: ${bug.url}`);
      console.log('');
    });
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
identifyBugs().catch(console.error);

View File

@@ -0,0 +1,232 @@
#!/usr/bin/env tsx
/**
* Import churches from JSON export to Neon database
* Run this LOCALLY (uses DATABASE_URL from .env pointing to Neon)
*/
import { PrismaClient } from '@prisma/client';
import fs from 'fs/promises';
import path from 'path';
/**
 * Shape of one entry of the `export-<country>.json` array as consumed by
 * main(): a church row with its schedule relations embedded.
 */
interface ChurchExport {
// Identifier from the exporting database; main() deletes it before writing.
id: string;
// Used in the per-church error message.
name: string;
// latitude + longitude together are the lookup key for an existing church.
latitude: number;
longitude: number;
country: string;
// Embedded relations; main() detaches them and inserts them separately.
massSchedules?: any[];
confessionSchedules?: any[];
adorationSchedules?: any[];
// Any further exported columns are passed through to the database as-is.
[key: string]: any;
}
/**
 * Writes one row per entry of `schedules` through `create`, with the exported
 * `id` dropped and `churchId` replaced by the id of the target church.
 * The input objects are not modified.
 *
 * @param schedules - Exported schedule rows (untyped JSON, hence `any`).
 * @param churchId - Id of the church in the target database.
 * @param create - Persists one prepared row; called sequentially.
 */
async function insertSchedules(
  schedules: any[],
  churchId: unknown,
  create: (data: any) => Promise<unknown>,
): Promise<void> {
  for (const schedule of schedules) {
    const data = { ...schedule, churchId };
    delete data.id;
    await create(data);
  }
}

/**
 * CLI entry point: imports `export-<country>.json` from the working directory
 * into the database behind `DATABASE_URL`.
 *
 * argv[2] is the country code (default `PL`); argv[3] is the mode, where
 * anything other than `dry-run` (the default) writes data. A church is matched
 * to an existing row by exact latitude/longitude. A match is updated and its
 * schedules are replaced; otherwise the church is created. A failure on one
 * church is logged and counted without stopping the run. A missing export
 * file prints the export instructions. Every fatal error exits with status 1.
 */
async function main() {
  const country = process.argv[2] || 'PL';
  const mode = process.argv[3] || 'dry-run';
  const dryRun = mode === 'dry-run';

  console.log(`📤 Importing ${country} data to Neon...`);
  console.log(`DATABASE_URL: ${process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@')}`);
  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No data will be written');
  }

  // Read export file
  const exportFile = path.join(process.cwd(), `export-${country}.json`);
  try {
    const data: ChurchExport[] = JSON.parse(await fs.readFile(exportFile, 'utf-8'));
    console.log(`Loaded ${data.length} churches from export file`);

    // Connect to Neon
    const prisma = new PrismaClient();
    try {
      await prisma.$connect();
      console.log('✅ Connected to Neon database');

      let inserted = 0;
      let updated = 0;
      // No code path skips a church; the counter is kept so the summary layout is unchanged.
      const skipped = 0;
      let errors = 0;
      let totalMassSchedules = 0;
      let totalConfessionSchedules = 0;
      let totalAdorationSchedules = 0;

      for (const church of data) {
        try {
          const massSchedules = church.massSchedules || [];
          const confessionSchedules = church.confessionSchedules || [];
          const adorationSchedules = church.adorationSchedules || [];

          // Remove relations and ids so only scalar church columns are written.
          delete church.massSchedules;
          delete church.confessionSchedules;
          delete church.adorationSchedules;
          delete church.id;

          if (dryRun) {
            // Dry run - just count
            inserted++;
            totalMassSchedules += massSchedules.length;
            totalConfessionSchedules += confessionSchedules.length;
            totalAdorationSchedules += adorationSchedules.length;
          } else {
            // Check if church already exists
            const existing = await prisma.churches.findFirst({
              where: {
                latitude: church.latitude,
                longitude: church.longitude,
              },
            });

            const saved = existing
              ? await prisma.churches.update({ where: { id: existing.id }, data: church })
              : await prisma.churches.create({ data: church });

            if (existing) {
              // Replace, rather than append to, the schedules already stored.
              await prisma.massSchedules.deleteMany({ where: { churchId: saved.id } });
              await prisma.confessionSchedules.deleteMany({ where: { churchId: saved.id } });
              await prisma.adorationSchedules.deleteMany({ where: { churchId: saved.id } });
            }

            // Counters are bumped per row so partial progress is still reported on failure.
            await insertSchedules(massSchedules, saved.id, async (row) => {
              await prisma.massSchedules.create({ data: row });
              totalMassSchedules++;
            });
            await insertSchedules(confessionSchedules, saved.id, async (row) => {
              await prisma.confessionSchedules.create({ data: row });
              totalConfessionSchedules++;
            });
            await insertSchedules(adorationSchedules, saved.id, async (row) => {
              await prisma.adorationSchedules.create({ data: row });
              totalAdorationSchedules++;
            });

            if (existing) {
              updated++;
            } else {
              inserted++;
            }
          }

          if ((inserted + updated) % 100 === 0) {
            console.log(`Progress: ${inserted + updated} churches processed...`);
          }
        } catch (error) {
          errors++;
          console.error(`Error importing church ${church.name}:`, error instanceof Error ? error.message : error);
        }
      }

      console.log('\n✅ Import complete!');
      console.log(` - ${inserted} churches inserted`);
      console.log(` - ${updated} churches updated`);
      console.log(` - ${skipped} churches skipped`);
      console.log(` - ${errors} errors`);
      console.log(` - ${totalMassSchedules} mass schedules`);
      console.log(` - ${totalConfessionSchedules} confession schedules`);
      console.log(` - ${totalAdorationSchedules} adoration schedules`);
    } catch (error) {
      console.error('❌ Import failed:', error);
      throw error;
    } finally {
      await prisma.$disconnect();
    }

    if (dryRun) {
      console.log('\n💡 This was a DRY RUN. To actually import to Neon, run:');
      console.log(` npx tsx scripts/import-to-neon.ts ${country} real-import`);
    } else {
      console.log('\n🎉 Data successfully uploaded to Neon!');
    }
  } catch (error) {
    if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {
      console.error(`❌ Export file not found: ${exportFile}`);
      console.error(`\nFirst, export data from NAS:`);
      console.error(` ssh albert@192.168.0.145`);
      console.error(` cd /volume1/docker/nearestmass`);
      console.error(` /usr/local/bin/docker compose --profile tools run --rm scraper npx tsx scripts/export-from-nas.ts ${country}`);
      console.error(`\nThen download the export:`);
      console.error(` scp albert@192.168.0.145:/volume1/docker/nearestmass/export-${country}.json .`);
      console.error(`\nFinally, run this import script again.`);
    } else {
      console.error('❌ Process failed:', error);
    }
    process.exit(1);
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,84 @@
#!/usr/bin/env tsx
/**
* Investigate the 8 potential parsing bugs
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Sites re-scraped by investigate(). `country` is passed to
// scraper.setCountry before `url` is scraped; `name` is only used as a label
// in the printed report.
const BUGS = [
{ name: 'Chapelle Saint-Jean-XXIII', country: 'FR', url: 'https://www.chemin-neuf.fr/' },
{ name: 'St. Marien', country: 'DE', url: 'https://www.willehad.de/start/' },
{ name: 'Iglesia de San Fernando', country: 'ES', url: 'https://www.parroquiasanfernandomaspalomas.net/de/' },
{ name: 'Monestir de Sant Esperit', country: 'ES', url: 'https://www.santoespiritu.org/' },
{ name: 'Santuario de Manalagua', country: 'ES', url: 'http://tierrasdeburgos.blogspot.com.es/2013/12/escultura-del-agua-santuario-de.html' },
{ name: 'Kościół pw. Najświętszego Serca', country: 'PL', url: 'http://parafialubojna.pl' },
{ name: 'Paróquia do Desterro', country: 'BR', url: 'https://paroquiaportodegalinhas.blogspot.com.br/' },
{ name: 'Catedral Diocesana', country: 'BR', url: 'http://diocesedejuazeiro.org.br/' },
];
/**
 * Returns the position of the first keyword (in list order) that occurs in
 * `text`, or 0 when none of them occurs.
 *
 * `indexOf` signals "not found" with -1, which is truthy, so the lookups
 * cannot be chained with `||`; each result has to be compared against -1.
 */
function findKeywordIndex(text: string, keywords: readonly string[]): number {
  for (const keyword of keywords) {
    const index = text.indexOf(keyword);
    if (index !== -1) {
      return index;
    }
  }
  return 0;
}

/**
 * Re-scrapes every entry of BUGS and prints, for each one, the scraper's
 * verdict plus - when the scrape failed but HTML was returned - a rough
 * classification of the page (blog, hotel/booking, restaurant, error page),
 * whether schedule keywords are present, and a 200-character text sample
 * starting shortly before the first mass keyword.
 *
 * A failure on one site is printed and the loop continues; the scraper is
 * closed in a `finally` block.
 */
async function investigate() {
  console.log('Investigating 8 potential bugs...\n');

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    for (let i = 0; i < BUGS.length; i++) {
      const bug = BUGS[i];
      console.log(`${'='.repeat(80)}`);
      console.log(`${i + 1}. ${bug.name} (${bug.country})`);
      console.log(` ${bug.url}`);
      console.log('='.repeat(80));

      scraper.setCountry(bug.country);
      try {
        const result = await scraper.scrape(bug.url);
        console.log(`Success: ${result.success}`);
        console.log(`Schedules: ${result.schedules.length}`);
        console.log(`Error: ${result.error || 'none'}`);

        if (!result.success && result.rawHtml) {
          // Drop scripts, styles and tags, then collapse whitespace.
          const text = result.rawHtml
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<[^>]+>/g, ' ')
            .replace(/\s+/g, ' ')
            .toLowerCase();

          // Check page type
          console.log('\nPage analysis:');
          if (text.includes('blogspot')) {
            console.log(' ⚠️ Blogspot page (likely blog post, not church website)');
          }
          if (text.includes('hotel') || text.includes('reservation') || text.includes('booking')) {
            console.log(' ⚠️ Contains hotel/booking keywords');
          }
          if (text.includes('restaurant') || text.includes('menu')) {
            console.log(' ⚠️ Contains restaurant keywords');
          }
          if (text.includes('404') || text.includes('not found') || text.includes('error')) {
            console.log(' ⚠️ Error/404 page');
          }

          // Check if it has schedule keywords
          const hasScheduleKeywords = /(mass|messe|misa|missa|horário|horario|gottesdienst|eucarist)/i.test(text);
          console.log(` Schedule keywords: ${hasScheduleKeywords ? '✓ Found' : '✗ Not found'}`);

          // Show sample text, starting 50 characters before the first mass keyword.
          const massIndex = findKeywordIndex(text, ['mass', 'messe', 'misa', 'missa']);
          const sampleStart = Math.max(0, massIndex - 50);
          const sample = text.substring(sampleStart, sampleStart + 200);
          console.log(`\n Sample text: "${sample}..."`);
        }
        console.log('\n');
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        console.log(`ERROR: ${message}\n\n`);
      }
    }
  } finally {
    await scraper.close();
  }
}
investigate().catch(console.error);

View File

@@ -0,0 +1,134 @@
import { config } from 'dotenv';
import { PrismaClient } from '@prisma/client';
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
// Load .env.local first, then .env
config({ path: '.env.local' });
config({ path: '.env' });
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
throw new Error('DATABASE_URL environment variable is not set');
}
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Formats `part / total` as a percentage with one decimal place and no `%`
 * sign. Returns `'0.0'` for an empty total instead of `'NaN'`.
 */
function formatPercent(part: number, total: number): string {
  return total === 0 ? '0.0' : ((part / total) * 100).toFixed(1);
}

/**
 * Loads every church (ordered by country, state, city), prints summary counts
 * and writes a Markdown report to `church-websites.md` containing the overall
 * summary, a per-country breakdown sorted by church count, all churches with a
 * website, and all churches without one.
 *
 * On failure the error is logged and the exit code is set to 1. The Prisma
 * client and pg pool are released in `finally` either way.
 */
async function listChurchWebsites() {
  try {
    console.log('Fetching churches from database...\n');

    const churches = await prisma.church.findMany({
      select: {
        id: true,
        name: true,
        city: true,
        state: true,
        country: true,
        website: true,
        googlePlaceId: true,
      },
      orderBy: [
        { country: 'asc' },
        { state: 'asc' },
        { city: 'asc' },
      ],
    });

    console.log(`Total churches: ${churches.length}`);

    const withWebsite = churches.filter(c => c.website);
    const withGoogle = churches.filter(c => c.googlePlaceId);
    const withoutWebsite = churches.filter(c => !c.website);

    console.log(`Churches with website: ${withWebsite.length}`);
    console.log(`Churches with Google Place ID: ${withGoogle.length}`);
    console.log(`Churches without website: ${withoutWebsite.length}\n`);

    // Group by country; churches without one are filed under "Unknown".
    const byCountry = churches.reduce((acc, church) => {
      const country = church.country || 'Unknown';
      if (!acc[country]) {
        acc[country] = [];
      }
      acc[country].push(church);
      return acc;
    }, {} as Record<string, typeof churches>);

    const total = churches.length;
    let output = '# Church Websites\n\n';
    output += `Generated: ${new Date().toISOString()}\n\n`;
    output += `## Summary\n`;
    output += `- Total churches: ${total}\n`;
    output += `- With website: ${withWebsite.length} (${formatPercent(withWebsite.length, total)}%)\n`;
    output += `- With Google Place ID: ${withGoogle.length} (${formatPercent(withGoogle.length, total)}%)\n`;
    output += `- Without website: ${withoutWebsite.length} (${formatPercent(withoutWebsite.length, total)}%)\n\n`;

    // Add country breakdown, largest country first.
    output += `## By Country\n\n`;
    const bySize = Object.entries(byCountry).sort(([, a], [, b]) => b.length - a.length);
    for (const [country, countryChurches] of bySize) {
      const withSite = countryChurches.filter(c => c.website).length;
      const withPlaceId = countryChurches.filter(c => c.googlePlaceId).length;
      output += `### ${country} (${countryChurches.length} churches)\n`;
      output += `- With website: ${withSite} (${formatPercent(withSite, countryChurches.length)}%)\n`;
      output += `- With Google Place ID: ${withPlaceId} (${formatPercent(withPlaceId, countryChurches.length)}%)\n\n`;
    }

    const byName = Object.entries(byCountry).sort(([a], [b]) => a.localeCompare(b));

    // List all websites
    output += `## All Websites\n\n`;
    for (const [country, countryChurches] of byName) {
      output += `### ${country}\n\n`;
      for (const church of countryChurches) {
        if (!church.website) {
          continue;
        }
        const location = [church.city, church.state, church.country].filter(Boolean).join(', ');
        output += `- **${church.name}** (${location})\n`;
        output += ` - Website: ${church.website}\n`;
        if (church.googlePlaceId) {
          output += ` - Google Place ID: ${church.googlePlaceId}\n`;
        }
        output += ` - DB ID: ${church.id}\n\n`;
      }
    }

    // List churches without websites
    output += `## Churches Without Websites\n\n`;
    for (const [country, countryChurches] of byName) {
      const without = countryChurches.filter(c => !c.website);
      if (without.length === 0) {
        continue;
      }
      output += `### ${country}\n\n`;
      for (const church of without) {
        const location = [church.city, church.state, church.country].filter(Boolean).join(', ');
        output += `- **${church.name}** (${location})\n`;
        if (church.googlePlaceId) {
          output += ` - Google Place ID: ${church.googlePlaceId}\n`;
        }
        output += ` - DB ID: ${church.id}\n\n`;
      }
    }

    // Write to file
    const fs = await import('fs/promises');
    await fs.writeFile('church-websites.md', output);
    console.log('✓ Written to church-websites.md');
  } catch (error) {
    console.error('Error:', error);
    // Set the exit code rather than calling process.exit(), which would
    // terminate before the cleanup below has run.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
listChurchWebsites();

View File

@@ -0,0 +1,44 @@
import { Pool } from 'pg';
import * as dotenv from 'dotenv';
import * as path from 'path';
// Load .env.local first (takes precedence), then .env
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
const pool = new Pool({
connectionString: process.env.DATABASE_URL,
});
/**
 * Prints the names of all tables in the `public` schema of the database
 * behind DATABASE_URL, followed by their count.
 *
 * Query errors are logged rather than thrown; the pool is always closed.
 */
async function listTables() {
  try {
    console.log('Connecting to database...');
    // Hide the credential segment of the connection string before echoing it.
    const maskedUrl = process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@');
    console.log('DATABASE_URL:', maskedUrl);

    // List all tables
    const { rows } = await pool.query(`
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public'
ORDER BY table_name;
`);

    console.log('\n=== Tables in Database ===');
    if (rows.length > 0) {
      for (const row of rows) {
        console.log(`- ${row.table_name}`);
      }
    } else {
      console.log('No tables found!');
    }
    console.log(`\nTotal: ${rows.length} tables`);
  } catch (error) {
    console.error('Error listing tables:', error);
  } finally {
    await pool.end();
  }
}
listTables();

View File

@@ -0,0 +1,167 @@
const { Client } = require("pg");
const client = new Client({
connectionString: "postgresql://postgres:postgres@192.168.0.145:5434/nearestmass"
});
const queries = [
{
name: "1. Overall church counts by country (top 20)",
sql: `SELECT country, COUNT(*) as total,
COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped,
COUNT(*) FILTER (WHERE has_website = true) as has_website_flag,
COUNT(*) FILTER (WHERE website_language IS NOT NULL) as has_language
FROM churches
GROUP BY country
ORDER BY total DESC
LIMIT 20`
},
{
name: "2. Total mass schedule counts",
sql: `SELECT COUNT(*) as total_schedules,
COUNT(DISTINCT church_id) as churches_with_schedules
FROM mass_schedules`
},
{
name: "3. Scrape results by language",
sql: `SELECT website_language as language,
COUNT(*) as total_scraped,
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped
FROM churches
WHERE website_language IS NOT NULL
GROUP BY website_language
ORDER BY total_scraped DESC`
},
{
name: "4. Churches with websites but never scraped",
sql: `SELECT COUNT(*) as has_website_not_scraped
FROM churches
WHERE website IS NOT NULL AND last_scraped_at IS NULL`
},
{
name: "5. Overall pipeline funnel",
sql: `SELECT
COUNT(*) as total_churches,
COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as attempted_scrape,
COUNT(*) FILTER (WHERE website_language IS NOT NULL) as has_detected_language,
(SELECT COUNT(DISTINCT church_id) FROM mass_schedules) as has_schedules_saved,
(SELECT COUNT(*) FROM mass_schedules) as total_schedule_rows
FROM churches`
},
{
name: "6. Recent scrape activity (last 7 days) by language",
sql: `SELECT website_language as language,
COUNT(*) as scraped_last_7d
FROM churches
WHERE last_scraped_at > NOW() - INTERVAL '7 days'
GROUP BY website_language
ORDER BY scraped_last_7d DESC`
},
{
name: "7. Background job history (last 15 completed/failed jobs)",
sql: `SELECT type, language, status,
created_at::date as created,
completed_at::date as completed,
ROUND(CAST(EXTRACT(EPOCH FROM (completed_at - created_at))/3600 AS numeric), 2) as hours,
total_items, processed, succeeded, failed
FROM background_jobs
WHERE status IN ('completed', 'failed')
ORDER BY completed_at DESC
LIMIT 15`
},
{
name: "8. Mass schedule breakdown by day of week",
sql: `SELECT day_of_week,
CASE day_of_week
WHEN 0 THEN 'Sunday' WHEN 1 THEN 'Monday' WHEN 2 THEN 'Tuesday'
WHEN 3 THEN 'Wednesday' WHEN 4 THEN 'Thursday' WHEN 5 THEN 'Friday'
WHEN 6 THEN 'Saturday' ELSE 'Other'
END as day_name,
COUNT(*) as count
FROM mass_schedules
GROUP BY day_of_week
ORDER BY day_of_week`
},
{
name: "9. Churches with schedules by country (top 15)",
sql: `SELECT c.country,
COUNT(DISTINCT c.id) as total_churches,
COUNT(DISTINCT ms.church_id) as churches_with_schedules,
ROUND(100.0 * COUNT(DISTINCT ms.church_id) / NULLIF(COUNT(DISTINCT c.id), 0), 1) as coverage_pct,
COUNT(ms.id) as total_schedule_rows
FROM churches c
LEFT JOIN mass_schedules ms ON ms.church_id = c.id
GROUP BY c.country
ORDER BY total_churches DESC
LIMIT 15`
},
{
name: "10. Enrichment sources - how churches were found",
sql: `SELECT source, COUNT(*) as count
FROM churches
GROUP BY source
ORDER BY count DESC`
},
{
name: "11. Google Places enrichment impact",
sql: `SELECT
COUNT(*) FILTER (WHERE google_place_id IS NOT NULL) as has_google_place,
COUNT(*) FILTER (WHERE google_place_id IS NOT NULL AND website IS NOT NULL) as google_with_website,
COUNT(*) FILTER (WHERE google_place_id IS NULL) as no_google_place,
COUNT(*) FILTER (WHERE google_searched_at IS NOT NULL) as google_searched,
COUNT(*) FILTER (WHERE free_searched_at IS NOT NULL) as free_searched
FROM churches`
},
{
name: "12. Website presence by source",
sql: `SELECT source,
COUNT(*) as total,
COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
ROUND(100.0 * COUNT(*) FILTER (WHERE website IS NOT NULL) / NULLIF(COUNT(*), 0), 1) as website_pct,
COUNT(*) FILTER (WHERE google_place_id IS NOT NULL) as has_google_place,
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped
FROM churches
GROUP BY source
ORDER BY total DESC`
}
];
/**
 * Runs every entry of `queries` in order and prints each result as a
 * fixed-width text table under a banner with the query's name.
 *
 * A failing query prints its error message and the loop moves on to the next
 * query. The client is connected once up front and closed after the last query.
 */
async function run() {
  await client.connect();

  const banner = "=".repeat(90);
  // NULL/undefined cells are rendered as the literal text NULL.
  const cell = (row, col) => String(row[col] ?? "NULL");

  for (const { name, sql } of queries) {
    console.log(banner);
    console.log(name);
    console.log(banner);

    try {
      const { rows } = await client.query(sql);

      if (rows.length > 0) {
        const columns = Object.keys(rows[0]);
        // Each column is as wide as its header or its longest value.
        const widths = columns.map(col =>
          rows.reduce((widest, row) => Math.max(widest, cell(row, col).length), col.length)
        );

        // Header, separator, then one line per row.
        console.log(columns.map((col, i) => col.padEnd(widths[i])).join(" | "));
        console.log(widths.map(width => "-".repeat(width)).join("-+-"));
        for (const row of rows) {
          console.log(columns.map((col, i) => cell(row, col).padEnd(widths[i])).join(" | "));
        }
      } else {
        console.log("(no rows returned)");
      }

      console.log(`\n(${rows.length} rows)\n`);
    } catch (err) {
      console.log("ERROR:", err.message, "\n");
    }
  }

  await client.end();
}
run().catch(e => { console.error(e); process.exit(1); });

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env tsx
/**
* Show detailed output from a successful French parse
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrapes one known-good French page and prints the parsed schedules grouped
 * by day of week (French day names, index 0 = Dimanche), one line per
 * schedule with its time, language and mass type.
 *
 * The scraper is closed in a `finally` block so it is released even when the
 * scrape itself throws.
 */
async function showSuccess() {
  // One of our successful churches with 16 schedules
  const url = 'https://laportelatine.org/lieux/couvent-saint-francois-morgon';
  console.log(`Detailed parse of: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('FR');
    const result = await scraper.scrape(url);

    console.log(`✅ Success: ${result.success}`);
    console.log(`📅 Schedules found: ${result.schedules.length}\n`);

    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    const dayNames = ['Dimanche', 'Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi'];

    console.log('═══════════════════════════════════════════════');
    console.log('PARSED SCHEDULE:');
    console.log('═══════════════════════════════════════════════\n');

    // Object keys are strings, so compare them numerically.
    const days = Object.entries(byDay).sort(([a], [b]) => parseInt(a) - parseInt(b));
    for (const [day, scheds] of days) {
      console.log(`${dayNames[parseInt(day)]}:`);
      for (const s of scheds) {
        console.log(` ${s.time} - ${s.language} ${s.massType}`);
      }
      console.log('');
    }
  } finally {
    await scraper.close();
  }
}
showSuccess().catch(console.error);

View File

@@ -0,0 +1,28 @@
#!/usr/bin/env tsx
/**
* Test database connection
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
console.log('DATABASE_URL exists:', !!process.env.DATABASE_URL);
console.log('DATABASE_URL value:', process.env.DATABASE_URL?.substring(0, 50) + '...');
import { prisma } from '../../src/lib/db';
/**
 * Smoke test for the database connection: counts the rows of the church
 * table and prints either the total or the failure message. The shared
 * Prisma client is disconnected in both cases.
 */
async function testConnection() {
  try {
    const totalChurches = await prisma.church.count();
    const successLines = [
      `✅ Database connection successful!`,
      `Total churches in database: ${totalChurches}`,
    ];
    for (const line of successLines) {
      console.log(line);
    }
  } catch (err: any) {
    console.log(`❌ Database connection failed:`);
    console.log(err.message);
  } finally {
    await prisma.$disconnect();
  }
}
testConnection();

View File

@@ -0,0 +1,180 @@
#!/usr/bin/env tsx
/**
* Test more French churches and collect diagnostic data
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Per-church record collected by testFrenchBroader for every scraped site.
 */
interface DiagnosticInfo {
// The church's website, i.e. the URL that was scraped.
url: string;
churchName: string;
// `success` flag reported by the scraper.
success: boolean;
// Number of schedules in the scrape result.
schedulesFound: number;
// True when the tag-stripped, lower-cased page text contains a French weekday name.
hasFrenchDays: boolean;
// True when the time regex matched at least once in that text.
hasTimePatterns: boolean;
// Up to 10 distinct time-like matches.
timePatternsSample: string[];
// First 500 characters of the tag-stripped text.
textSample: string;
// Error reported by the scraper, if any.
error?: string;
}
/**
 * Scrapes up to 20 French OSM churches that have a website and prints, for
 * every failure, whether the page text contained French day names and/or
 * time-like tokens. This separates likely parser bugs from pages that simply
 * carry no schedule.
 *
 * A church counts as a success only when the scraper reports success AND
 * returns at least one schedule; the failure analysis applies the same rule,
 * so "success with zero schedules" is analysed as a failure too.
 *
 * The browser and the database handles are released even when a step throws.
 */
async function testFrenchBroader(): Promise<void> {
  console.log('Testing 20 French churches with diagnostics...\n');

  // Built once instead of once per church.
  const frenchDays = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'];
  const timeRegex = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h)?/g;
  // Single definition of "failed", shared by the live log and the analysis.
  const isFailure = (d: DiagnosticInfo): boolean => !(d.success && d.schedulesFound > 0);

  try {
    // Get more French churches
    const churches = await prisma.church.findMany({
      where: {
        country: 'FR',
        website: { not: null },
        source: 'osm',
      },
      take: 20,
      orderBy: { createdAt: 'asc' },
    });
    if (churches.length === 0) {
      console.log('No French churches found.');
      return;
    }
    console.log(`Found ${churches.length} French churches to test\n`);

    const scraper = new GenericScraper();
    await scraper.init();
    const diagnostics: DiagnosticInfo[] = [];
    try {
      scraper.setCountry('FR');
      for (const [i, church] of churches.entries()) {
        // The query above only returns rows whose website is not null.
        const url = church.website!;
        console.log(`[${i + 1}/${churches.length}] Testing: ${church.name} (${church.city || 'Unknown'})`);
        console.log(`URL: ${url}`);
        try {
          const result = await scraper.scrape(url);

          // Extract diagnostics from the visible page text
          let hasFrenchDays = false;
          let hasTimePatterns = false;
          let timePatternsSample: string[] = [];
          let textSample = '';
          if (result.rawHtml) {
            const text = result.rawHtml
              .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
              .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
              .replace(/<[^>]+>/g, ' ')
              .replace(/\s+/g, ' ')
              .toLowerCase();
            textSample = text.substring(0, 500);
            hasFrenchDays = frenchDays.some(day => text.includes(day));
            const times = text.match(timeRegex);
            if (times) {
              hasTimePatterns = true;
              timePatternsSample = [...new Set(times)].slice(0, 10);
            }
          }

          const diagnostic: DiagnosticInfo = {
            url,
            churchName: church.name,
            success: result.success,
            schedulesFound: result.schedules.length,
            hasFrenchDays,
            hasTimePatterns,
            timePatternsSample,
            textSample,
            error: result.error,
          };
          diagnostics.push(diagnostic);

          if (!isFailure(diagnostic)) {
            console.log(`✅ SUCCESS - ${result.schedules.length} schedules`);
          } else {
            // The scraper may report success without an error message but
            // with no schedules; avoid printing "undefined" in that case.
            console.log(`❌ FAILED - ${result.error ?? 'no schedules found'}`);
            if (hasFrenchDays && !hasTimePatterns) {
              console.log(` 💡 Has French days but no times`);
            } else if (!hasFrenchDays && hasTimePatterns) {
              console.log(` 💡 Has times but no French days`);
            } else if (hasFrenchDays && hasTimePatterns) {
              console.log(` 💡 Has BOTH days and times - parsing issue!`);
              console.log(` Sample times: ${timePatternsSample.slice(0, 5).join(', ')}`);
            } else {
              console.log(` 💡 No mass schedule content found`);
            }
          }
          console.log('');
        } catch (err: unknown) {
          const message = err instanceof Error ? err.message : String(err);
          console.log(`❌ ERROR - ${message}\n`);
          diagnostics.push({
            url,
            churchName: church.name,
            success: false,
            schedulesFound: 0,
            hasFrenchDays: false,
            hasTimePatterns: false,
            timePatternsSample: [],
            textSample: '',
            error: message,
          });
        }
      }
    } finally {
      await scraper.close();
    }

    // Analysis
    const failures = diagnostics.filter(isFailure);
    const successCount = diagnostics.length - failures.length;
    console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log(`\nRESULTS: ${successCount}/${churches.length} successful (${((successCount / churches.length) * 100).toFixed(0)}%)`);
    console.log('');
    const hasBoth = failures.filter(d => d.hasFrenchDays && d.hasTimePatterns);
    const hasDaysNoTimes = failures.filter(d => d.hasFrenchDays && !d.hasTimePatterns);
    const hasTimesNoDays = failures.filter(d => !d.hasFrenchDays && d.hasTimePatterns);
    const hasNeither = failures.filter(d => !d.hasFrenchDays && !d.hasTimePatterns);
    console.log('FAILURE ANALYSIS:');
    console.log(` Has days + times but failed: ${hasBoth.length} (PARSING BUG)`);
    console.log(` Has days but no times: ${hasDaysNoTimes.length}`);
    console.log(` Has times but no days: ${hasTimesNoDays.length}`);
    console.log(` Has neither: ${hasNeither.length} (no mass schedule on page)`);
    console.log('');
    if (hasBoth.length > 0) {
      console.log('⚠️ PARSING BUGS TO FIX (has both days and times but failed):');
      hasBoth.forEach(d => {
        console.log(` ${d.churchName}`);
        console.log(` URL: ${d.url}`);
        console.log(` Sample times found: ${d.timePatternsSample.slice(0, 5).join(', ')}`);
        console.log(` Text sample: ${d.textSample.substring(0, 150)}...`);
        console.log('');
      });
    }
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
testFrenchBroader().catch(console.error);

View File

@@ -0,0 +1,100 @@
#!/usr/bin/env tsx
/**
* Test international scraper against French churches
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Fail fast with a clear message rather than handing `undefined` to pg
// when DATABASE_URL is missing from the environment.
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}
// Prisma talks to Postgres through the pg driver adapter backed by this pool.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/**
 * Scrapes the five oldest French OSM churches that have a website and prints
 * the schedules found for each, grouped by weekday, followed by a
 * success/failure tally.
 *
 * A church counts as a success only when the scraper reports success and
 * returns at least one schedule. The browser and the database handles are
 * released even when a step throws.
 */
async function testFrenchScraper(): Promise<void> {
  console.log('Testing French church mass schedule scraping...\n');
  const dayNames = ['Dimanche', 'Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi'];

  try {
    // Get French churches with websites
    const churches = await prisma.church.findMany({
      where: {
        country: 'FR',
        website: { not: null },
        source: 'osm',
      },
      take: 5,
      orderBy: { createdAt: 'asc' },
    });
    if (churches.length === 0) {
      console.log('No French churches with websites found.');
      return;
    }
    console.log(`Found ${churches.length} French churches to test:\n`);

    const scraper = new GenericScraper();
    await scraper.init();
    let successCount = 0;
    let failCount = 0;
    try {
      scraper.setCountry('FR');
      for (const church of churches) {
        console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
        console.log(`Church: ${church.name}`);
        console.log(`City: ${church.city || 'Unknown'}`);
        console.log(`URL: ${church.website}`);
        console.log('');
        try {
          // The query above only returns rows whose website is not null.
          const result = await scraper.scrape(church.website!);
          if (result.success && result.schedules.length > 0) {
            successCount++;
            console.log(`✅ SUCCESS - Found ${result.schedules.length} schedules\n`);
            // Group by day and show
            const byDay: Record<number, typeof result.schedules> = {};
            for (const sched of result.schedules) {
              if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
              byDay[sched.dayOfWeek].push(sched);
            }
            Object.entries(byDay).forEach(([day, scheds]) => {
              console.log(` ${dayNames[parseInt(day, 10)]}:`);
              scheds.forEach(s => {
                console.log(` ${s.time} - ${s.language || 'Unknown'} (${s.massType || 'Mass'})`);
              });
            });
            console.log('');
          } else {
            failCount++;
            // A "successful" scrape with zero schedules carries no error text.
            console.log(`❌ FAILED - ${result.error ?? 'no schedules found'}`);
            console.log('');
          }
        } catch (err: unknown) {
          failCount++;
          console.log(`❌ ERROR - ${err instanceof Error ? err.message : String(err)}`);
          console.log('');
        }
      }
    } finally {
      await scraper.close();
    }

    console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log(`\nRESULTS: ${successCount}/${churches.length} successful (${((successCount / churches.length) * 100).toFixed(0)}%)`);
    console.log(`Success: ${successCount}, Failed: ${failCount}\n`);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
testFrenchScraper().catch(console.error);

View File

@@ -0,0 +1,210 @@
#!/usr/bin/env tsx
/**
* Test scraper on a diverse sample of international churches
* to identify edge cases across different languages and formats
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/** One hand-picked church website used as input for the scraper test run. */
interface TestChurch {
  /** Display name printed in the test output. */
  name: string;
  /** Page to scrape. */
  url: string;
  /** Country code handed to the scraper before scraping. */
  country: string;
  /** Human-readable language label, printed in the test output. */
  language: string;
  /** Optional expectation, e.g. "Sun-Sat" or "Sun, Wed, Sat". */
  expectedDays?: string;
  /** Optional free-form remark printed next to the church. */
  notes?: string;
}
// Sample churches from different countries/languages.
// Hand-picked fixture list, grouped by language (see the section comments).
// For each entry, testChurch() passes `country` to scraper.setCountry() and
// then scrapes `url`; `name`, `language` and `notes` are only printed.
const testChurches: TestChurch[] = [
// FRENCH
{
name: 'Saint-Étienne du Mont, Paris',
url: 'https://www.saintetiennedumontparis.fr/',
country: 'FR',
language: 'French',
notes: 'French format with "du lundi au vendredi"',
},
{
name: 'Notre-Dame de la Garde, Marseille',
url: 'https://www.notredamedelagarde.fr/',
country: 'FR',
language: 'French',
notes: 'Major pilgrimage site',
},
// GERMAN
{
name: 'St. Peter, Munich',
url: 'https://www.alterpeter.de/',
country: 'DE',
language: 'German',
notes: 'German format with "bis" for ranges',
},
{
name: 'Kölner Dom, Cologne',
url: 'https://www.koelner-dom.de/',
country: 'DE',
language: 'German',
notes: 'Cathedral with Uhr time format',
},
// SPANISH
{
name: 'Sagrada Família, Barcelona',
url: 'https://sagradafamilia.org/',
country: 'ES',
language: 'Spanish',
notes: 'Major tourist site, may have complex schedule',
},
{
name: 'Parroquia San Miguel, Madrid',
url: 'https://www.parroquiasanmiguel.es/',
country: 'ES',
language: 'Spanish',
notes: 'Spanish format with "de lunes a viernes"',
},
// PORTUGUESE
{
name: 'Basílica da Estrela, Lisbon',
url: 'https://www.basilicadaestrela.com/',
country: 'PT',
language: 'Portuguese',
notes: 'Portuguese format',
},
// ITALIAN
{
name: 'Santa Maria Maggiore, Rome',
url: 'https://www.vatican.va/various/basiliche/sm_maggiore/index_it.htm',
country: 'IT',
language: 'Italian',
notes: 'Major basilica',
},
{
name: 'Duomo di Milano',
url: 'https://www.duomomilano.it/',
country: 'IT',
language: 'Italian',
notes: 'Cathedral with Italian format',
},
// DUTCH
{
name: 'Basiliek van de H. Nicolaas, Amsterdam',
url: 'https://www.nicolaas-parochie.nl/',
country: 'NL',
language: 'Dutch',
notes: 'Dutch format with "tot" for ranges',
},
// CZECH
{
name: 'Chrám sv. Víta, Prague',
url: 'https://www.katedralasvatehovita.cz/',
country: 'CZ',
language: 'Czech',
notes: 'Czech format',
},
// HUNGARIAN
{
name: 'Szent István Bazilika, Budapest',
url: 'https://www.bazilika.biz/',
country: 'HU',
language: 'Hungarian',
notes: 'Hungarian format',
},
// More complex cases
{
name: 'Cathédrale Notre-Dame, Strasbourg',
url: 'https://www.cathedrale-strasbourg.fr/',
country: 'FR',
language: 'French',
notes: 'Bilingual region (French/German)',
},
];
/** Outcome of scraping one sample church; the values double as the summary counter keys. */
type TestOutcome = 'success' | 'failed' | 'noSchedules';

/**
 * Scrapes one sample church and prints what was found, grouped by weekday.
 *
 * @param church - Fixture entry describing the site to scrape.
 * @param scraper - An already initialised scraper; its country is switched to
 *   `church.country` before scraping.
 * @returns `'success'` when at least one schedule was found, `'noSchedules'`
 *   when the scrape succeeded but returned nothing, and `'failed'` when the
 *   scraper reported failure or threw.
 */
async function testChurch(church: TestChurch, scraper: GenericScraper): Promise<TestOutcome> {
  console.log(`\n${'='.repeat(80)}`);
  console.log(`📍 ${church.name}`);
  console.log(` ${church.url}`);
  console.log(` Language: ${church.language} | Country: ${church.country}`);
  if (church.notes) console.log(` Notes: ${church.notes}`);
  console.log(`${'='.repeat(80)}`);
  try {
    scraper.setCountry(church.country);
    const result = await scraper.scrape(church.url);
    if (!result.success) {
      console.log(`❌ FAILED: ${result.error || 'Unknown error'}`);
      return 'failed';
    }
    if (result.schedules.length === 0) {
      console.log(`⚠️ SUCCESS but NO SCHEDULES found`);
      return 'noSchedules';
    }
    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }
    const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
    console.log(`\n✅ Found ${result.schedules.length} schedules:\n`);
    for (let i = 0; i < 7; i++) {
      if (byDay[i]) {
        const times = byDay[i].map(s => {
          let str = s.time;
          if (s.massType) str += ` (${s.massType})`;
          if (s.language && s.language !== 'English') str += ` [${s.language}]`;
          return str;
        }).join(', ');
        console.log(` ${dayNames[i]}: ${times}`);
      }
    }
    return 'success';
  } catch (error) {
    console.log(`❌ ERROR: ${error instanceof Error ? error.message : String(error)}`);
    return 'failed';
  }
}

/**
 * Runs every fixture in `testChurches` through one shared scraper, pausing
 * two seconds between sites, and prints a summary of the outcomes.
 */
async function main(): Promise<void> {
  const scraper = new GenericScraper();
  await scraper.init();
  const results: Record<TestOutcome, number> = {
    success: 0,
    failed: 0,
    noSchedules: 0,
  };
  try {
    console.log('🌍 INTERNATIONAL CHURCH SCRAPER TEST');
    console.log(`Testing ${testChurches.length} churches across ${new Set(testChurches.map(c => c.country)).size} countries`);
    for (const church of testChurches) {
      // Tally the outcome so the summary below reflects what actually happened.
      results[await testChurch(church, scraper)]++;
      // Brief delay between requests to be respectful
      await new Promise(resolve => setTimeout(resolve, 2000));
    }
  } finally {
    // Close the browser even if something unexpected throws mid-run.
    await scraper.close();
  }
  console.log(`\n${'='.repeat(80)}`);
  console.log('📊 SUMMARY');
  console.log(`${'='.repeat(80)}`);
  console.log(`Total tested: ${testChurches.length}`);
  console.log(`✅ Success with schedules: ${results.success}`);
  console.log(`⚠️ Success but no schedules: ${results.noSchedules}`);
  console.log(`❌ Failed: ${results.failed}`);
}
main().catch(console.error);

View File

@@ -0,0 +1,36 @@
/**
* Quick test script to verify the masstimes.org JSON API scraper works
* Usage: npx tsx scripts/test-masstimes-api.ts
*/
import { writeFileSync } from 'fs';
import { MassTimesScraper } from '../../src/lib/masstimes-scraper';
/**
 * Smoke test for the masstimes.org scraper: fetches the churches near a fixed
 * coordinate (Greenville, SC) and writes them to `scraped-churches.json`.
 *
 * On failure the error is printed and the process exit code is set to 1; the
 * scraper is closed in every case.
 */
async function main(): Promise<void> {
  console.log('Testing MassTimes.org JSON API Scraper\n');
  const scraper = new MassTimesScraper();
  try {
    await scraper.init();
    console.log('Browser initialized\n');
    const lat = 34.852;
    const lng = -82.394;
    console.log(`Fetching churches near Greenville, SC (${lat}, ${lng})...\n`);
    const churches = await scraper.scrapeByLocation(lat, lng);
    const outPath = 'scraped-churches.json';
    writeFileSync(outPath, JSON.stringify(churches, null, 2));
    console.log(`\nSaved ${churches.length} churches to ${outPath}`);
  } catch (error) {
    console.error('TEST FAILED:', error);
    // Record the failure without terminating here: process.exit() would end
    // the process immediately, so the `finally` cleanup below would never run.
    process.exitCode = 1;
  } finally {
    await scraper.close();
  }
}
main();

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env tsx
/**
* Test which sections are being created for Polish church
*/
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
// Page text copied verbatim from the Polish parish site, lowercased the same
// way the scraper input is.
const text = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();
console.log('Text:');
console.log(text);
console.log('\n');

/** Escapes regex metacharacters so a day name can be embedded in a pattern. */
const escapeForRegex = (raw: string): string => raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

// Alternation of every known Polish day name, longest first so that longer
// names win over names that are their prefixes.
const dayPatterns = buildDayPatterns(getDayNamesForCountry('PL'));
const allDayNamesPattern = Object.keys(dayPatterns)
  .sort((a, b) => b.length - a.length)
  .map(name => escapeForRegex(name))
  .join('|');

/**
 * Builds the "section" regex for one day: the day name, then a lazy capture
 * of everything up to the next day name or the end of the text.
 */
const sectionRegexFor = (dayName: string): RegExp =>
  new RegExp(
    `(?:^|\\s|[,;:])${escapeForRegex(dayName)}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );

/** Finds "H MM" style times (hour and minutes separated by whitespace). */
const findSpacedTimes = (section: string): RegExpMatchArray | null =>
  section.match(/\b(\d{1,2})\s+(\d{2})(?!\d)/g);

console.log('=== Testing individual day matching ===\n');
// Test niedziela specifically
const sundayMatch = text.match(sectionRegexFor('niedziela'));
if (!sundayMatch) {
  console.log(`✗ niedziela NOT matched`);
  console.log('');
  // Try simpler regex
  const fallbackMatch = text.match(/niedziela[:\s]+(.{0,100})/i);
  if (fallbackMatch) {
    console.log(`Simple regex matched: "${fallbackMatch[1]}"`);
  }
} else {
  console.log(`✓ niedziela matched!`);
  console.log(` Full match: "${sundayMatch[0].substring(0, 100)}"`);
  console.log(` Captured text: "${sundayMatch[1].substring(0, 100)}"`);
  console.log('');
  // Test if times can be extracted from captured text
  const sundayTimes = findSpacedTimes(sundayMatch[1]);
  console.log(` Times in captured text: ${sundayTimes ? sundayTimes.join(', ') : 'none'}`);
}

// Test poniedziałek
console.log('\n=== Testing poniedziałek ===\n');
const mondayMatch = text.match(sectionRegexFor('poniedziałek'));
if (!mondayMatch) {
  console.log(`✗ poniedziałek NOT matched`);
} else {
  console.log(`✓ poniedziałek matched!`);
  console.log(` Captured text: "${mondayMatch[1].substring(0, 100)}"`);
  const mondayTimes = findSpacedTimes(mondayMatch[1]);
  console.log(` Times: ${mondayTimes ? mondayTimes.join(', ') : 'none'}`);
}

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env tsx
/**
* Test Polish church with detailed section logging
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Temporarily modify GenericScraper to add logging.
// The patch wraps parseSchedules for the lifetime of this process: it logs
// the sections that findScheduleSections() produces and then delegates to the
// original implementation, so the parsed result itself is unchanged.
// NOTE(review): bracket access is presumably used because these members are
// not public on GenericScraper - confirm against the class definition.
const originalParse = GenericScraper.prototype['parseSchedules'];
GenericScraper.prototype['parseSchedules'] = function(html: string) {
// Reduce the HTML to lowercased text: drop <script>/<style> blocks, replace
// remaining tags with spaces and collapse whitespace runs.
// NOTE(review): presumably mirrors the normalisation done inside the real
// parseSchedules - verify, otherwise the logged sections may differ.
const text = html
.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
.replace(/<[^>]+>/g, ' ')
.replace(/\s+/g, ' ')
.toLowerCase();
// Call findScheduleSections and log result
const sections = this['findScheduleSections'](text);
console.log('\n=== Sections found by findScheduleSections() ===\n');
// Index = section.day, used only for labelling the log lines below.
const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
// Each section is read as { day, text }; only the first 80 characters of the
// text are printed.
sections.forEach((section: any, i: number) => {
console.log(`Section ${i + 1}: ${dayNames[section.day]} (day ${section.day})`);
console.log(` Text: "${section.text.substring(0, 80)}..."`);
});
console.log(`\nTotal sections: ${sections.length}\n`);
// Continue with normal processing
return originalParse.call(this, html);
};
/**
 * Scrapes one Polish parish site with the country set to 'PL' and prints the
 * parsed mass times grouped by weekday.
 *
 * The browser is closed even when scraping throws.
 */
async function testPolish(): Promise<void> {
  const url = 'http://parafialubojna.pl';
  console.log(`Testing: ${url}`);
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('PL');
    const result = await scraper.scrape(url);
    console.log(`\nFinal result: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);
    if (result.schedules.length > 0) {
      const byDay: Record<number, typeof result.schedules> = {};
      for (const sched of result.schedules) {
        if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
        byDay[sched.dayOfWeek].push(sched);
      }
      // Index = dayOfWeek, starting at 0 for Sunday (niedziela).
      const dayNamesPL = ['Niedziela', 'Poniedziałek', 'Wtorek', 'Środa', 'Czwartek', 'Piątek', 'Sobota'];
      console.log('Parsed schedules by day:');
      for (let i = 0; i < 7; i++) {
        if (byDay[i]) {
          console.log(` ${dayNamesPL[i]}: ${byDay[i].map(s => s.time).join(', ')}`);
        }
      }
    }
  } finally {
    // Without this, an exception from scrape() would leave the browser open.
    await scraper.close();
  }
}
testPolish().catch(console.error);

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env tsx
/**
* Test which pattern is matching "00" time
*/
// Sample line taken from a German church page
const testText = "10:00 uhr lateinisches amt";
// Same line with the hour removed, to see which patterns grab a bare "00"
const testText2 = "00 uhr lateinisches";

// Candidate time regexes, each labelled with the format it targets.
const timePatterns = [
  { name: '12-hour AM/PM', pattern: /(\d{1,2}):(\d{2})\s*(AM|PM|am|pm|a\.m\.|p\.m\.)/g },
  { name: '12-hour no minutes', pattern: /(?<![:\d])(\d{1,2})\s*(AM|PM|am|pm|a\.m\.|p\.m\.)/g },
  { name: '24-hour colon', pattern: /(?<![:\d\w])(\d{1,2}):(\d{2})(?!\s*(AM|PM|am|pm))/g },
  { name: 'French/Portuguese h', pattern: /(?<![:\d\w])(\d{1,2})\s*h\s*(\d{2})?(?!\w)/gi },
  { name: 'Italian period', pattern: /(?<![:\d\w])(\d{1,2})\.(\d{2})(?=\s|$|,|;|\)|\])/g },
  { name: 'German Uhr (old)', pattern: /(\d{1,2})[:\.]?(\d{2})?\s*Uhr/gi },
  { name: 'German Uhr (fixed)', pattern: /(?<![:\d])(\d{1,2})[:\.]?(\d{2})?\s*Uhr/gi },
  { name: 'Polish space', pattern: /\b(\d{1,2})\s+(\d{2})(?!\d)/g },
];

/** Prints, for every candidate pattern, each match it finds in `sample`. */
function reportMatches(sample: string): void {
  console.log(`Test text: "${sample}"\n`);
  for (const candidate of timePatterns) {
    const found = Array.from(sample.matchAll(candidate.pattern));
    if (found.length === 0) {
      console.log(`${candidate.name}: no match`);
      continue;
    }
    console.log(`${candidate.name}:`);
    found.forEach(hit => {
      console.log(` Matched: "${hit[0]}" at index ${hit.index}`);
    });
  }
}

reportMatches(testText);
// Now test with just "00 uhr"
console.log(`\n${'='.repeat(60)}\n`);
reportMatches(testText2);

View File

@@ -0,0 +1,193 @@
#!/usr/bin/env tsx
/**
* Quick test of top 5 priority countries
*/
import { config } from 'dotenv';
config({ path: '.env.local' });
config({ path: '.env' });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
// Fail fast with a clear message rather than handing `undefined` to pg
// when DATABASE_URL is missing from the environment.
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}
// Prisma talks to Postgres through the pg driver adapter backed by this pool.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/** Maximum number of churches fetched and scraped for each country. */
const PER_COUNTRY = 10;

/**
 * Countries covered by this run, in test order. `code` is used both as the
 * database filter and as the scraper country; `name` is for display only.
 */
const COUNTRIES = [
  { name: 'France', code: 'FR' },
  { name: 'Germany', code: 'DE' },
  { name: 'Spain', code: 'ES' },
  { name: 'Poland', code: 'PL' },
  { name: 'Brazil', code: 'BR' },
];
/** Aggregated scraping outcome for a single country. */
interface CountryResult {
  /** Country code the churches were selected by. */
  country: string;
  /** Display name used in the summary table. */
  countryName: string;
  /** Number of churches that were scraped. */
  tested: number;
  /** Churches where the scraper succeeded and returned at least one schedule. */
  success: number;
  /** Churches that returned nothing usable or threw. */
  failed: number;
  /** `success / tested` as a percentage (0 when nothing was tested). */
  successRate: number;
  /** Failures whose page text contained both day names and times, i.e. parsing failed. */
  hasBothButFailed: number;
  /** Sum of schedules over all successful churches. */
  totalSchedules: number;
  /** Short description of the first successful church, if there was one. */
  sampleSuccess?: string;
}
/**
 * For every entry in COUNTRIES, scrapes up to PER_COUNTRY of the oldest OSM
 * churches that have a website and prints a per-country and an overall
 * summary table.
 *
 * A church counts as a success when the scraper reports success and returns
 * at least one schedule. A failure whose page text contains both a weekday
 * name (English, French, German, Spanish, Polish or Portuguese) and a
 * time-like token is counted as a probable parsing bug.
 *
 * The browser and the database handles are released even when a step throws.
 */
async function testTop5(): Promise<void> {
  console.log(`Testing top ${COUNTRIES.length} priority countries (${PER_COUNTRY} churches each)...\n`);

  // Built once instead of once per failed church. Plain `\b` only knows ASCII
  // word characters, so names that start or end with an accented letter
  // (e.g. "środa") could never match after a space; Unicode-aware lookarounds
  // give the intended whole-word behaviour.
  const dayNamePattern = /(?<![\p{L}\p{N}_])(?:sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|terça|terca|quarta|quinta|sexta)(?![\p{L}\p{N}_])/iu;
  const timePattern = /\d{1,2}[h:\.]\s*\d{0,2}/;

  try {
    const scraper = new GenericScraper();
    await scraper.init();
    const results: CountryResult[] = [];
    try {
      for (const country of COUNTRIES) {
        console.log(`\n${'='.repeat(60)}`);
        console.log(`Testing ${country.name} (${country.code})`);
        console.log('='.repeat(60));
        const churches = await prisma.church.findMany({
          where: {
            country: country.code,
            website: { not: null },
            source: 'osm',
          },
          take: PER_COUNTRY,
          orderBy: { createdAt: 'asc' },
        });
        if (churches.length === 0) {
          console.log(`No churches with websites found for ${country.name}\n`);
          continue;
        }
        scraper.setCountry(country.code);
        let success = 0;
        let failed = 0;
        let hasBothButFailed = 0;
        let totalSchedules = 0;
        let sampleSuccess: string | undefined;
        for (const [i, church] of churches.entries()) {
          process.stdout.write(`[${i + 1}/${churches.length}] ${church.name.substring(0, 40).padEnd(40)} `);
          try {
            // The query above only returns rows whose website is not null.
            const result = await scraper.scrape(church.website!);
            if (result.success && result.schedules.length > 0) {
              success++;
              totalSchedules += result.schedules.length;
              process.stdout.write(`${result.schedules.length} schedules\n`);
              if (!sampleSuccess) {
                sampleSuccess = `${church.name}: ${result.schedules.length} schedules`;
              }
            } else {
              failed++;
              process.stdout.write(`${result.error}\n`);
              // Check if has both days and times (parsing bug indicator)
              if (result.rawHtml) {
                const text = result.rawHtml
                  .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
                  .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
                  .replace(/<[^>]+>/g, ' ')
                  .replace(/\s+/g, ' ')
                  .toLowerCase();
                if (dayNamePattern.test(text) && timePattern.test(text)) {
                  hasBothButFailed++;
                  process.stdout.write(` ⚠️ Has days + times but failed to parse\n`);
                }
              }
            }
          } catch (err: unknown) {
            failed++;
            process.stdout.write(`❌ ERROR: ${err instanceof Error ? err.message : String(err)}\n`);
          }
        }
        const successRate = churches.length > 0 ? (success / churches.length) * 100 : 0;
        results.push({
          country: country.code,
          countryName: country.name,
          tested: churches.length,
          success,
          failed,
          successRate,
          hasBothButFailed,
          totalSchedules,
          sampleSuccess,
        });
        console.log(`\n${country.name} Summary: ${success}/${churches.length} (${successRate.toFixed(0)}%)`);
        console.log(` Total schedules extracted: ${totalSchedules}`);
        if (hasBothButFailed > 0) {
          console.log(` ⚠️ Parsing bugs: ${hasBothButFailed} (has content but failed to parse)`);
        }
      }
    } finally {
      // A database error in the middle of the loop must not leave the browser open.
      await scraper.close();
    }

    // Final summary
    console.log('\n\n');
    console.log('═'.repeat(80));
    console.log(`FINAL RESULTS - TOP ${COUNTRIES.length} COUNTRIES`);
    console.log('═'.repeat(80));
    console.log('');
    console.log('Country | Tested | Success | Rate | Schedules | Bugs');
    console.log('─'.repeat(80));
    const totalTested = results.reduce((sum, r) => sum + r.tested, 0);
    const totalSuccess = results.reduce((sum, r) => sum + r.success, 0);
    const totalSchedules = results.reduce((sum, r) => sum + r.totalSchedules, 0);
    const totalBugs = results.reduce((sum, r) => sum + r.hasBothButFailed, 0);
    results.forEach(r => {
      const country = r.countryName.padEnd(12);
      const tested = String(r.tested).padStart(6);
      const success = String(r.success).padStart(7);
      const rate = `${r.successRate.toFixed(0)}%`.padStart(5);
      const schedules = String(r.totalSchedules).padStart(9);
      const bugs = r.hasBothButFailed > 0 ? `⚠️ ${r.hasBothButFailed}` : '✓';
      console.log(`${country} | ${tested} | ${success} | ${rate} | ${schedules} | ${bugs}`);
    });
    console.log('─'.repeat(80));
    const avgRate = totalTested > 0 ? (totalSuccess / totalTested) * 100 : 0;
    console.log(`OVERALL | ${String(totalTested).padStart(6)} | ${String(totalSuccess).padStart(7)} | ${avgRate.toFixed(0).padStart(4)}% | ${String(totalSchedules).padStart(9)} | ${totalBugs > 0 ? `⚠️ ${totalBugs}` : '✓'}`);
    console.log('');
    console.log('═'.repeat(80));
    console.log('');
    if (totalBugs > 0) {
      console.log(`⚠️ ${totalBugs} parsing bugs detected (has days + times but failed)`);
      console.log(' These need investigation and fixes.\n');
    } else {
      console.log('✅ No parsing bugs! All failures are legitimate (no content or wrong page).\n');
    }
    console.log(`Total churches tested: ${totalTested}`);
    console.log(`Total successful: ${totalSuccess} (${avgRate.toFixed(1)}%)`);
    console.log(`Total mass schedules extracted: ${totalSchedules}`);
    console.log('');
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
testTop5().catch(console.error);

View File

@@ -0,0 +1,173 @@
#!/usr/bin/env tsx
/**
* Test website scraper on churches with websites
* Analyzes which websites can be scraped successfully
*/
// Load .env
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import { GenericScraper } from '../../src/scrapers/strategies/generic';
import fs from 'fs';
// Fail fast with a clear message rather than handing `undefined` to pg
// when DATABASE_URL is missing from the environment.
const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}
// Prisma talks to Postgres through the pg driver adapter backed by this pool.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
/** Outcome of scraping one church website; this is what gets written to the JSON report. */
interface TestResult {
  /** Database id of the church. */
  churchId: string;
  /** Church name. */
  name: string;
  /** URL that was actually scraped (after normalisation). */
  website: string;
  /** Country value stored on the church record. */
  country: string;
  /** The `success` flag reported by the scraper (false when scraping threw). */
  success: boolean;
  /** Number of schedules returned by the scraper. */
  massesFound: number;
  /** The schedules themselves; absent when scraping threw. */
  schedules?: {
    dayOfWeek: number;
    time: string;
    massType?: string;
    language?: string;
  }[];
  /** Error message from the scraper or from a thrown exception, if any. */
  error?: string;
}
/**
 * Makes a stored website value usable as a navigation target.
 *
 * Surrounding whitespace is removed. Values that already start with
 * `http://` or `https://` (in any letter case) are returned as they are;
 * everything else gets an `https://` prefix, with a leading protocol-relative
 * `//` dropped first so the result never contains `https:////`.
 *
 * @param url - Website value as stored on the church record.
 * @returns The trimmed URL with an explicit http(s) scheme.
 */
function normalizeUrl(url: string): string {
  const trimmed = url.trim();
  if (/^https?:\/\//i.test(trimmed)) {
    return trimmed;
  }
  return `https://${trimmed.replace(/^\/\//, '')}`;
}
/**
 * Scrapes the most recently created churches that have a website, prints the
 * outcome for each one plus a summary, and writes all results to
 * `scraper-test-results.json` in the current working directory.
 *
 * @param limit - Maximum number of churches to test.
 * @param country - When given, only churches with this country value are tested.
 * @returns One result per tested church, in test order.
 */
async function testScrapers(limit: number = 50, country?: string): Promise<TestResult[]> {
  const results: TestResult[] = [];
  // Index = dayOfWeek as returned by the scraper.
  const days = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];

  // Get churches with websites
  const churches = await prisma.church.findMany({
    where: {
      website: { not: null },
      ...(country ? { country } : {}),
    },
    take: limit,
    orderBy: { createdAt: 'desc' },
  });
  console.log(`Testing ${churches.length} churches with websites...\n`);

  // Initialize the scraper (launches Playwright browser)
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    for (const [i, church] of churches.entries()) {
      // The query above only returns rows whose website is not null.
      const url = normalizeUrl(church.website!);
      console.log(`[${i + 1}/${churches.length}] Testing: ${church.name}`);
      console.log(` Website: ${url}`);
      try {
        const result = await scraper.scrape(url);
        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: result.success,
          massesFound: result.schedules.length,
          schedules: result.schedules.map((s) => ({
            dayOfWeek: s.dayOfWeek,
            time: s.time,
            massType: s.massType,
            language: s.language,
          })),
          error: result.error,
        });
        if (result.success) {
          console.log(`${result.schedules.length} masses found`);
          for (const s of result.schedules) {
            console.log(` ${days[s.dayOfWeek]} ${s.time} (${s.language || 'English'}${s.massType ? ', ' + s.massType : ''})`);
          }
        } else {
          console.log(` ✗ No masses found: ${result.error}`);
        }
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        console.log(` ✗ Error: ${message}`);
        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: false,
          massesFound: 0,
          error: message,
        });
      }
      console.log('');
    }
  } finally {
    // Always close the browser
    await scraper.close();
  }

  // Summary
  const successful = results.filter((r) => r.success);
  const failed = results.filter((r) => !r.success);
  const totalMasses = results.reduce((sum, r) => sum + r.massesFound, 0);
  // Guard the division so an empty run prints 0.0% instead of NaN%.
  const percentOfTotal = (part: number): string =>
    (results.length > 0 ? (part / results.length) * 100 : 0).toFixed(1);
  console.log('============================================================');
  console.log('Test Summary');
  console.log('============================================================');
  console.log(`Total churches tested: ${results.length}`);
  console.log(`Successful scrapes: ${successful.length} (${percentOfTotal(successful.length)}%)`);
  console.log(`Failed scrapes: ${failed.length} (${percentOfTotal(failed.length)}%)`);
  console.log(`Total masses found: ${totalMasses}`);
  console.log('============================================================');
  if (failed.length > 0) {
    console.log('\nFailed websites:');
    for (const f of failed) {
      console.log(` - ${f.name}: ${f.website} (${f.error})`);
    }
  }
  console.log('');

  // Export results (without raw HTML to keep file manageable)
  fs.writeFileSync(
    'scraper-test-results.json',
    JSON.stringify(results, null, 2)
  );
  console.log('Results saved to scraper-test-results.json');
  return results;
}
/**
 * Command-line entry point.
 *
 * Recognised flags:
 *   --limit <n>      maximum number of churches to test (positive integer, default 50)
 *   --country <code> only test churches with this country value (default: all)
 *
 * @throws Error when a flag is given without a usable value.
 */
async function main(): Promise<void> {
  try {
    const args = process.argv.slice(2);
    const limitIndex = args.indexOf('--limit');
    const countryIndex = args.indexOf('--country');

    let limit = 50;
    if (limitIndex !== -1) {
      // Reject a missing or non-numeric value here; otherwise NaN would be
      // passed on as the query's `take`.
      limit = Number.parseInt(args[limitIndex + 1] ?? '', 10);
      if (!Number.isInteger(limit) || limit < 1) {
        throw new Error('--limit requires a positive integer value');
      }
    }

    let country: string | undefined;
    if (countryIndex !== -1) {
      country = args[countryIndex + 1];
      if (!country || country.startsWith('--')) {
        throw new Error('--country requires a value');
      }
    }

    console.log('============================================================');
    console.log('Website Scraper Testing');
    console.log('============================================================');
    console.log(`Limit: ${limit}`);
    console.log(`Country: ${country || 'All'}`);
    console.log('============================================================\n');
    await testScrapers(limit, country);
  } finally {
    // Release the database handles on every path.
    await prisma.$disconnect();
    await pool.end();
  }
}
main().catch(console.error);

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env tsx
/**
* Verify Paróquia da Paz schedules are correctly parsed
*/
import { GenericScraper } from '../../src/scrapers/strategies/generic';
/**
 * Scrapes the Paróquia da Paz website with the country set to 'BR', prints
 * the parsed schedule grouped by weekday, then prints the schedule expected
 * from the website so the two can be compared by eye.
 *
 * The browser is closed even when scraping throws.
 */
async function verifyPazSchedules(): Promise<void> {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Verifying: ${url}\n`);
  const scraper = new GenericScraper();
  await scraper.init();
  try {
    scraper.setCountry('BR');
    const result = await scraper.scrape(url);
    console.log(`✅ Success: ${result.success}`);
    console.log(`📅 Schedules found: ${result.schedules.length}\n`);

    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }
    // Index = dayOfWeek, starting at 0 for Sunday (Domingo).
    const dayNames = ['Domingo', 'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado'];
    console.log('═══════════════════════════════════════════════');
    console.log('PARSED SCHEDULE:');
    console.log('═══════════════════════════════════════════════\n');
    Object.entries(byDay)
      .sort(([a], [b]) => parseInt(a, 10) - parseInt(b, 10))
      .forEach(([day, scheds]) => {
        console.log(`${dayNames[parseInt(day, 10)]}:`);
        scheds.forEach(s => {
          // Language and mass type may be missing; leave them out rather
          // than printing the literal text "undefined".
          const details = [s.language, s.massType].filter(Boolean).join(' ');
          console.log(details ? ` ${s.time} - ${details}` : ` ${s.time}`);
        });
        console.log('');
      });
    console.log('Expected schedule (from website):');
    console.log('Segunda, Terça, Quarta, Sexta: 16:00 e 18:00');
    console.log('Quinta: 16:00 e 19:00');
    console.log('Sábado: 08:00, 16:00 e 18:00');
    console.log('Domingo: 08:00, 11:00, 16:00, 18:00 e 20:00');
  } finally {
    await scraper.close();
  }
}
verifyPazSchedules().catch(console.error);