chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
165
scripts/debug/analyze-enrichment-priority.ts
Normal file
165
scripts/debug/analyze-enrichment-priority.ts
Normal file
@@ -0,0 +1,165 @@
|
||||
import { config } from 'dotenv';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
|
||||
// Environment files are read in order: .env.local first, then .env.
for (const envFile of ['.env.local', '.env']) {
  config({ path: envFile });
}

const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}

// One pg pool backs the Prisma client through the pg driver adapter.
const pool = new Pool({ connectionString });
const prisma = new PrismaClient({ adapter: new PrismaPg(pool) });
|
||||
|
||||
/** Per-country aggregate used to rank where website enrichment should happen first. */
interface CountryStats {
  /** Country value as stored on the church rows, or "Unknown" when missing. */
  country: string;
  /** Number of churches counted for this country. */
  totalChurches: number;
  /** Churches that already have a website. */
  withWebsite: number;
  /** Churches that have no website. */
  withoutWebsite: number;
  /** `withWebsite` as a percentage of `totalChurches`. */
  websitePercent: number;
  /** Churches that still need enrichment. */
  needEnrichment: number;
  /** Ranking score; higher values are processed first. */
  priority: number;
}
|
||||
|
||||
/**
 * Ranks countries by how many OSM-sourced churches still lack a website and
 * prints the ranking, a top-10 breakdown, a rough timeline and a ready-to-paste
 * `COUNTRY_PRIORITY` array to stdout.
 *
 * A church counts as "needing enrichment" when `hasWebsite` is falsy and
 * `website` is empty. Churches without a country are grouped under "Unknown".
 *
 * Failures are logged and reported through `process.exitCode = 1` instead of
 * `process.exit(1)`: exiting inside `catch` would terminate the process before
 * the `finally` block could close the Prisma client and the pg pool.
 */
async function analyzeEnrichmentPriority(): Promise<void> {
  // Daily throughput assumed by the timeline estimate (printed as "free tier").
  const churchesPerDay = 390;
  const rule = '═'.repeat(75);

  try {
    console.log('Analyzing enrichment priority by country...\n');

    // Only the columns needed for the statistics are selected.
    const churches = await prisma.church.findMany({
      where: { source: 'osm' },
      select: { country: true, hasWebsite: true, website: true },
    });

    // Tally per-country counters; Map keeps insertion order like the object it replaces.
    const byCountry = new Map<string, CountryStats>();
    for (const church of churches) {
      const country = church.country || 'Unknown';
      let entry = byCountry.get(country);
      if (entry === undefined) {
        entry = {
          country,
          totalChurches: 0,
          withWebsite: 0,
          withoutWebsite: 0,
          websitePercent: 0,
          needEnrichment: 0,
          priority: 0,
        };
        byCountry.set(country, entry);
      }

      entry.totalChurches++;
      if (church.hasWebsite || church.website) {
        entry.withWebsite++;
      } else {
        entry.withoutWebsite++;
        entry.needEnrichment++;
      }
    }

    // Priority = need (in thousands) * (0.8 + 0.2 * coverage gap as a fraction).
    // The need term dominates, so large countries with low coverage rank first.
    const stats = Array.from(byCountry.values(), (stat): CountryStats => {
      // totalChurches is at least 1 for every entry, so this never divides by zero.
      const websitePercent = (stat.withWebsite / stat.totalChurches) * 100;
      const needWeight = stat.needEnrichment / 1000;
      const coverageGap = 100 - websitePercent;
      return {
        ...stat,
        websitePercent,
        priority: needWeight * 0.8 + (coverageGap / 100) * needWeight * 0.2,
      };
    });

    // Highest priority first.
    stats.sort((a, b) => b.priority - a.priority);

    console.log(rule);
    console.log('ENRICHMENT PRIORITY RANKING');
    console.log(rule);
    console.log('');
    // The printed formula now describes what is actually computed above.
    console.log('Priority formula: (churches_needing_enrichment / 1000) * (0.8 + 0.2 * coverage_gap / 100)');
    console.log('This favors countries with many churches and low website coverage.');
    console.log('');
    console.log('Rank | Country | Total | Need Enrichment | Coverage | Priority Score');
    console.log('─────┼─────────┼───────┼────────────────┼──────────┼────────────────');

    stats.forEach((stat, index) => {
      const rank = String(index + 1).padStart(4);
      const country = stat.country.padEnd(7);
      const total = String(stat.totalChurches).padStart(5);
      const need = String(stat.needEnrichment).padStart(15);
      const coverage = `${stat.websitePercent.toFixed(1)}%`.padStart(8);
      const priority = stat.priority.toFixed(2).padStart(14);

      console.log(`${rank} | ${country} | ${total} | ${need} | ${coverage} | ${priority}`);
    });

    console.log('');
    console.log(rule);
    console.log('');

    console.log('TOP 10 COUNTRIES TO PRIORITIZE:');
    console.log('');

    stats.slice(0, 10).forEach((stat, index) => {
      console.log(`${index + 1}. ${stat.country}`);
      console.log(` Total churches: ${stat.totalChurches.toLocaleString()}`);
      console.log(` Need enrichment: ${stat.needEnrichment.toLocaleString()} (${(100 - stat.websitePercent).toFixed(1)}% missing)`);
      console.log(` Current coverage: ${stat.websitePercent.toFixed(1)}%`);
      console.log(` Priority score: ${stat.priority.toFixed(2)}`);
      console.log('');
    });

    // Rough timeline at the assumed daily throughput.
    const totalNeedEnrichment = stats.reduce((sum, s) => sum + s.needEnrichment, 0);
    const daysAtFullSpeed = Math.ceil(totalNeedEnrichment / churchesPerDay);
    const monthsAtFullSpeed = (daysAtFullSpeed / 30).toFixed(1);

    console.log(rule);
    console.log('ENRICHMENT TIMELINE');
    console.log(rule);
    console.log(`Total churches needing enrichment: ${totalNeedEnrichment.toLocaleString()}`);
    console.log(`At ${churchesPerDay} churches/day (free tier): ${daysAtFullSpeed} days (~${monthsAtFullSpeed} months)`);
    console.log('');

    // Emit a copy-pasteable array literal; countries with nothing left to enrich are omitted.
    console.log(rule);
    console.log('COUNTRY PRIORITY ORDER (for enrichment script)');
    console.log(rule);
    console.log('');
    console.log('const COUNTRY_PRIORITY = [');
    // Filter once instead of once per printed row.
    const pending = stats.filter((s) => s.needEnrichment > 0);
    pending.forEach((stat, index) => {
      const comma = index < pending.length - 1 ? ',' : '';
      console.log(` '${stat.country}'${comma} // ${stat.needEnrichment.toLocaleString()} churches`);
    });
    console.log('];');
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // Signal failure without cutting the cleanup below short.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
analyzeEnrichmentPriority();
|
||||
66
scripts/debug/check-2-real-bugs.ts
Normal file
66
scripts/debug/check-2-real-bugs.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Check the 2 potentially real bugs
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/** Weekday abbreviations indexed by a schedule's `dayOfWeek` (index 0 is 'Sun'). */
const DAY_LABELS = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];

/** Schedule list type as returned by `GenericScraper.scrape`. */
type ScrapedSchedules = Awaited<ReturnType<GenericScraper['scrape']>>['schedules'];

/** Prints up to five schedules as "<day> <time> - <language> <massType>". */
function printSampleSchedules(schedules: ScrapedSchedules): void {
  console.log('Sample schedules:');
  schedules.slice(0, 5).forEach((s) => {
    console.log(` ${DAY_LABELS[s.dayOfWeek]} ${s.time} - ${s.language} ${s.massType}`);
  });
}

/**
 * Returns the position of the first keyword (in the given order) that occurs
 * in `text`, or -1 when none of them occurs.
 */
function findFirstKeywordIndex(text: string, keywords: readonly string[]): number {
  for (const keyword of keywords) {
    const index = text.indexOf(keyword);
    if (index !== -1) {
      return index;
    }
  }
  return -1;
}

/**
 * Re-scrapes two church sites that looked like genuine scraper bugs and prints
 * the outcome: a Spanish parish (root page instead of the /de/ page) and a
 * Polish parish. When the Polish scrape yields no schedules, a snippet around
 * the first Polish schedule keyword in the page text is printed instead.
 */
async function checkRealBugs(): Promise<void> {
  const scraper = new GenericScraper();
  await scraper.init();

  // Close the scraper even when one of the scrapes throws.
  try {
    console.log('=== 1. Iglesia de San Fernando (trying Spanish page) ===\n');

    scraper.setCountry('ES');
    const spanishUrl = 'https://www.parroquiasanfernandomaspalomas.net/'; // Remove /de/
    const result1 = await scraper.scrape(spanishUrl);

    console.log(`URL: ${spanishUrl}`);
    console.log(`Success: ${result1.success}`);
    console.log(`Schedules: ${result1.schedules.length}`);
    console.log(`Error: ${result1.error || 'none'}\n`);

    if (result1.schedules.length > 0) {
      printSampleSchedules(result1.schedules);
    }

    console.log('\n=== 2. Kościół (Poland) ===\n');

    scraper.setCountry('PL');
    const result2 = await scraper.scrape('http://parafialubojna.pl');

    console.log(`Success: ${result2.success}`);
    console.log(`Schedules: ${result2.schedules.length}`);
    console.log(`Error: ${result2.error || 'none'}\n`);

    if (result2.schedules.length > 0) {
      printSampleSchedules(result2.schedules);
    } else if (result2.rawHtml) {
      // Reduce the HTML to lower-cased plain text before searching it.
      const text = result2.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Look for Polish schedule keywords. Chaining indexOf() calls with `||`
      // does not work: a miss is -1 (truthy, so later keywords are never
      // tried) and a hit at position 0 is falsy (so it is skipped).
      const scheduleIndex = findFirstKeywordIndex(text, ['msze', 'msza', 'nabożeńst']);
      if (scheduleIndex !== -1) {
        const snippet = text.substring(scheduleIndex, scheduleIndex + 300);
        console.log('Found schedule section:');
        console.log(snippet);
      }
    }
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
checkRealBugs().catch(console.error);
|
||||
79
scripts/debug/check-enrichment-detail.ts
Normal file
79
scripts/debug/check-enrichment-detail.ts
Normal file
@@ -0,0 +1,79 @@
|
||||
import { Pool } from 'pg';
|
||||
import * as dotenv from 'dotenv';
|
||||
import * as path from 'path';
|
||||
|
||||
// Both env files are resolved against the working directory; .env.local is loaded
// first (takes precedence), then .env.
for (const envFile of ['.env.local', '.env']) {
  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
}

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/**
 * Prints three reports about Google enrichment progress, where a church counts
 * as pending while `google_place_id` is NULL:
 *   1. the 20 countries with the most pending churches,
 *   2. overall enriched/pending totals with percentages,
 *   3. enriched rows grouped by `updated_at` date over the last 7 days.
 *
 * Errors are logged and the process exit code is set to 1; the pool is always closed.
 */
async function checkEnrichmentDetail(): Promise<void> {
  try {
    console.log('Connecting to database...\n');

    // Churches awaiting enrichment, grouped by country.
    const pendingResult = await pool.query(`
      SELECT
        country,
        COUNT(*) as pending_count
      FROM churches
      WHERE google_place_id IS NULL
      GROUP BY country
      ORDER BY pending_count DESC
      LIMIT 20;
    `);

    console.log('=== Churches Awaiting Enrichment (Top 20 Countries) ===');
    let totalPending = 0;
    for (const row of pendingResult.rows) {
      console.log(`${row.country}: ${row.pending_count} churches`);
      // The count is parsed explicitly as a base-10 integer before summing.
      totalPending += Number.parseInt(row.pending_count, 10);
    }
    console.log(`\nTotal pending shown: ${totalPending}`);

    // Overall totals.
    const statsResult = await pool.query(`
      SELECT
        COUNT(*) as total_churches,
        COUNT(CASE WHEN google_place_id IS NOT NULL THEN 1 END) as enriched,
        COUNT(CASE WHEN google_place_id IS NULL THEN 1 END) as pending
      FROM churches;
    `);

    const totals = statsResult.rows[0];
    const totalChurches = Number(totals.total_churches);
    // An empty table would otherwise print "NaN%".
    const share = (count: unknown): string =>
      (totalChurches === 0 ? 0 : (Number(count) / totalChurches) * 100).toFixed(2);

    console.log('\n=== Overall Stats ===');
    console.log(`Total churches: ${totals.total_churches}`);
    console.log(`Enriched: ${totals.enriched} (${share(totals.enriched)}%)`);
    console.log(`Pending: ${totals.pending} (${share(totals.pending)}%)`);

    // Daily activity over the last week, based on updated_at of enriched rows.
    const rateResult = await pool.query(`
      SELECT
        DATE(updated_at) as date,
        COUNT(*) as enriched_count
      FROM churches
      WHERE google_place_id IS NOT NULL
        AND updated_at > NOW() - INTERVAL '7 days'
      GROUP BY DATE(updated_at)
      ORDER BY date DESC;
    `);

    console.log('\n=== Enrichment Activity (Last 7 Days) ===');
    if (rateResult.rows.length === 0) {
      console.log('No enrichment activity in the last 7 days');
    } else {
      for (const row of rateResult.rows) {
        console.log(`${row.date}: ${row.enriched_count} churches`);
      }
    }
  } catch (error) {
    console.error('Error checking enrichment detail:', error);
    // Report the failure to the caller/shell instead of exiting with status 0.
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
checkEnrichmentDetail();
|
||||
146
scripts/debug/check-enrichment-status.ts
Normal file
146
scripts/debug/check-enrichment-status.ts
Normal file
@@ -0,0 +1,146 @@
|
||||
import { config } from 'dotenv';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
|
||||
// Read .env.local before .env.
['.env.local', '.env'].forEach((envFile) => {
  config({ path: envFile });
});

const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}

// Prisma talks to Postgres through a pg pool wrapped in the pg driver adapter.
const pool = new Pool({ connectionString });
const prisma = new PrismaClient({ adapter: new PrismaPg(pool) });
|
||||
|
||||
/**
 * Prints the enrichment status of OSM-sourced churches: overall totals, a
 * per-country line for a fixed list of ten priority countries, and a timeline
 * estimate at a fixed daily throughput.
 *
 * Failures are logged and reported through `process.exitCode = 1` instead of
 * `process.exit(1)`, which would end the process before the `finally` block
 * could disconnect Prisma and end the pool.
 */
async function checkEnrichmentStatus(): Promise<void> {
  // Daily throughput assumed by the timeline estimate.
  const churchesPerDay = 390;
  const rule = '═'.repeat(63);

  // Percentage of `part` in `whole`; an empty denominator yields 0 instead of NaN.
  const percent = (part: number, whole: number, digits: number): string =>
    (whole === 0 ? 0 : (part / whole) * 100).toFixed(digits);

  try {
    console.log('Checking enrichment status...\n');

    // Cut-off for "recently enriched": 24 hours ago.
    const yesterday = new Date();
    yesterday.setDate(yesterday.getDate() - 1);

    // The overall counts are independent of each other, so they run concurrently.
    const [totalOSM, enriched, withWebsite, needEnrichment, recentlyEnriched] = await Promise.all([
      prisma.church.count({ where: { source: 'osm' } }),
      prisma.church.count({ where: { source: 'osm', googlePlaceId: { not: null } } }),
      prisma.church.count({ where: { source: 'osm', hasWebsite: true } }),
      prisma.church.count({ where: { source: 'osm', hasWebsite: false, website: null } }),
      prisma.church.count({
        where: { source: 'osm', googlePlaceId: { not: null }, updatedAt: { gte: yesterday } },
      }),
    ]);

    // Top 10 priority countries, reported in this order.
    const PRIORITY_COUNTRIES = ['FR', 'DE', 'ES', 'PL', 'BR', 'PT', 'PH', 'CZ', 'MX', 'HU'];

    console.log(rule);
    console.log('OVERALL ENRICHMENT STATUS');
    console.log(rule);
    console.log(`Total OSM churches: ${totalOSM.toLocaleString()}`);
    console.log(`Churches with Google Place ID: ${enriched.toLocaleString()} (${percent(enriched, totalOSM, 2)}%)`);
    console.log(`Churches with websites: ${withWebsite.toLocaleString()} (${percent(withWebsite, totalOSM, 2)}%)`);
    console.log(`Need enrichment: ${needEnrichment.toLocaleString()} (${percent(needEnrichment, totalOSM, 2)}%)`);
    console.log('');
    console.log(`Recently enriched (24h): ${recentlyEnriched.toLocaleString()}`);
    console.log('');

    console.log(rule);
    console.log('TOP 10 PRIORITY COUNTRIES STATUS');
    console.log(rule);
    console.log('');

    // Countries are processed one after another so the output order is stable;
    // the four counts for a single country run concurrently.
    for (const country of PRIORITY_COUNTRIES) {
      const [total, countryEnriched, countryWithWebsite, countryNeedEnrichment] = await Promise.all([
        prisma.church.count({ where: { source: 'osm', country } }),
        prisma.church.count({ where: { source: 'osm', country, googlePlaceId: { not: null } } }),
        // NOTE(review): unlike the overall "with websites" figure, this also counts
        // churches that only have a Google Place ID — confirm that is intended.
        prisma.church.count({
          where: {
            source: 'osm',
            country,
            OR: [{ hasWebsite: true }, { googlePlaceId: { not: null } }],
          },
        }),
        prisma.church.count({ where: { source: 'osm', country, hasWebsite: false, website: null } }),
      ]);

      console.log(
        `${country.padEnd(4)} | Total: ${String(total).padStart(6)} | Enriched: ${String(countryEnriched).padStart(5)} (${percent(countryEnriched, total, 1)}%) | With Website: ${String(countryWithWebsite).padStart(5)} (${percent(countryWithWebsite, total, 1)}%) | Need: ${String(countryNeedEnrichment).padStart(6)}`,
      );
    }

    console.log('');

    // Timeline estimate at the assumed throughput.
    const daysRemaining = Math.ceil(needEnrichment / churchesPerDay);
    const monthsRemaining = (daysRemaining / 30).toFixed(1);
    const completion = new Date(Date.now() + daysRemaining * 24 * 60 * 60 * 1000);

    console.log(rule);
    console.log('TIMELINE ESTIMATE');
    console.log(rule);
    console.log(`At ${churchesPerDay} churches/day:`);
    console.log(` Days remaining: ${daysRemaining} days`);
    console.log(` Months remaining: ~${monthsRemaining} months`);
    console.log(` Estimated completion: ${completion.toLocaleDateString()}`);
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // Signal failure without skipping the cleanup below.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
checkEnrichmentStatus();
|
||||
78
scripts/debug/check-enrichment.ts
Normal file
78
scripts/debug/check-enrichment.ts
Normal file
@@ -0,0 +1,78 @@
|
||||
import { Pool } from 'pg';
|
||||
import * as dotenv from 'dotenv';
|
||||
import * as path from 'path';
|
||||
|
||||
// .env.local is loaded first (takes precedence), then .env; both paths are
// resolved against the current working directory.
const workingDir = process.cwd();
dotenv.config({ path: path.resolve(workingDir, '.env.local') });
dotenv.config({ path: path.resolve(workingDir, '.env') });

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/**
 * Prints a summary of Google enrichment, where a church counts as enriched
 * when `google_place_id` is set:
 *   1. total enriched, enriched in the last 24 hours, and the latest `updated_at`,
 *   2. the ten countries with the most rows updated in the last 24 hours,
 *   3. a sample of the 20 most recently updated enriched churches.
 *
 * Errors are logged and the process exit code is set to 1; the pool is always closed.
 */
async function checkEnrichment(): Promise<void> {
  try {
    console.log('Connecting to database...');

    // Totals across all enriched churches.
    const totalResult = await pool.query(`
      SELECT
        COUNT(*) as total_enriched,
        COUNT(CASE WHEN updated_at > NOW() - INTERVAL '24 hours' THEN 1 END) as enriched_last_24h,
        MAX(updated_at) as last_enrichment
      FROM churches
      WHERE google_place_id IS NOT NULL;
    `);

    const summary = totalResult.rows[0];
    console.log('\n=== Google Enrichment Summary ===');
    console.log(`Total churches with Google Place ID: ${summary.total_enriched}`);
    console.log(`Enriched in last 24 hours: ${summary.enriched_last_24h}`);
    // MAX() is NULL when no church has been enriched yet; avoid printing "null".
    console.log(`Last enrichment: ${summary.last_enrichment ?? 'never'}`);

    // Breakdown by country, most recent activity first.
    const countryResult = await pool.query(`
      SELECT
        country,
        COUNT(*) as enriched_count,
        COUNT(CASE WHEN updated_at > NOW() - INTERVAL '24 hours' THEN 1 END) as enriched_last_24h
      FROM churches
      WHERE google_place_id IS NOT NULL
      GROUP BY country
      ORDER BY enriched_last_24h DESC
      LIMIT 10;
    `);

    console.log('\n=== Top Countries Enriched (Last 24h) ===');
    for (const row of countryResult.rows) {
      console.log(`${row.country}: ${row.enriched_last_24h} new / ${row.enriched_count} total`);
    }

    // Sample of individual churches touched in the last 24 hours.
    const recentResult = await pool.query(`
      SELECT
        name,
        city,
        country,
        google_place_id,
        updated_at
      FROM churches
      WHERE google_place_id IS NOT NULL
        AND updated_at > NOW() - INTERVAL '24 hours'
      ORDER BY updated_at DESC
      LIMIT 20;
    `);

    console.log('\n=== Recent Enrichments (Last 24h, sample) ===');
    for (const row of recentResult.rows) {
      const timestamp = row.updated_at ? new Date(row.updated_at).toISOString() : 'unknown';
      console.log(`${row.name}, ${row.city}, ${row.country} - ${timestamp}`);
    }
  } catch (error) {
    console.error('Error checking enrichment:', error);
    // Report the failure to the caller/shell instead of exiting with status 0.
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
checkEnrichment();
|
||||
45
scripts/debug/check-german-office-hours.ts
Normal file
45
scripts/debug/check-german-office-hours.ts
Normal file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Check the full section text for German church to understand office hours pattern
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes https://www.alterpeter.de/ with the country set to DE and prints the
 * page text that follows the first occurrence of "montag" (200 characters) and
 * "sonntag" (300 characters), to help understand the office-hours pattern.
 *
 * The scraper is closed even when the scrape throws.
 */
async function checkGerman(): Promise<void> {
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape('https://www.alterpeter.de/');
    if (!result.rawHtml) {
      return;
    }

    // Reduce the HTML to lower-cased plain text before searching it.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Each entry: keyword to locate, heading to print, and how much text to show after it.
    const sections = [
      { keyword: 'montag', heading: '=== Monday (Montag) context ===', length: 200 },
      { keyword: 'sonntag', heading: '=== Sunday (Sonntag) context ===', length: 300 },
    ];

    for (const { keyword, heading, length } of sections) {
      const start = text.indexOf(keyword);
      if (start === -1) {
        continue;
      }
      console.log(heading);
      console.log(text.substring(start, start + length));
      console.log('');
    }
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
checkGerman().catch(console.error);
|
||||
51
scripts/debug/check-neon-poland.ts
Normal file
51
scripts/debug/check-neon-poland.ts
Normal file
@@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { config } from 'dotenv';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
// Load environment variables: .env.local is read before .env.
for (const envFile of ['.env.local', '.env']) {
  config({ path: envFile });
}
|
||||
|
||||
/**
 * Returns the connection string with its password replaced by "****" so it
 * can be logged. Falls back to a regex replacement when the value cannot be
 * parsed as a URL (for example when it is empty).
 */
function maskDatabaseUrl(raw: string): string {
  try {
    const parsed = new URL(raw);
    if (parsed.password) {
      parsed.password = '****';
    }
    return parsed.toString();
  } catch {
    return raw.replace(/:[^:@]+@/, ':****@');
  }
}

/**
 * Connects to the database named by DATABASE_URL and prints how many Polish
 * (country 'PL') churches exist, how many of them have at least one mass
 * schedule, and three sample churches with their schedule counts.
 *
 * The Prisma client and the pool are closed even when a query fails.
 */
async function main(): Promise<void> {
  const connectionString = process.env.DATABASE_URL || '';
  console.log('DATABASE_URL:', maskDatabaseUrl(connectionString));

  const pool = new Pool({ connectionString });
  const prisma = new PrismaClient({ adapter: new PrismaPg(pool) });

  try {
    console.log('PrismaClient created:', !!prisma);
    // NOTE(review): sibling debug scripts access this model as `prisma.church`;
    // confirm `churches` matches the generated client.
    console.log('prisma.churches:', !!prisma.churches);

    await prisma.$connect();

    const count = await prisma.churches.count({ where: { country: 'PL' } });
    console.log(`Poland churches in Neon: ${count}`);

    const withSchedules = await prisma.churches.count({
      where: {
        country: 'PL',
        massSchedules: { some: {} },
      },
    });
    console.log(`With mass schedules: ${withSchedules}`);

    // Sample a few churches together with their schedules.
    const sample = await prisma.churches.findMany({
      where: { country: 'PL' },
      include: { massSchedules: true },
      take: 3,
    });

    console.log('\nSample churches:');
    for (const church of sample) {
      console.log(` - ${church.name} (${church.city}): ${church.massSchedules.length} schedules`);
    }
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
38
scripts/debug/check-niedziela-occurrences.ts
Normal file
38
scripts/debug/check-niedziela-occurrences.ts
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes http://parafialubojna.pl with the country set to PL and lists every
 * occurrence of "niedziela" in the page text, each with 30 characters of
 * leading and 70 characters of trailing context (measured from the match start).
 *
 * The scraper is closed even when the scrape throws.
 */
async function check(): Promise<void> {
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('PL');

    const result = await scraper.scrape('http://parafialubojna.pl');
    if (!result.rawHtml) {
      return;
    }

    // Reduce the HTML to lower-cased plain text before searching it.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    const keyword = 'niedziela';
    const matches: Array<{ position: number; context: string }> = [];
    // Resume each search one character past the previous hit.
    for (let idx = text.indexOf(keyword); idx !== -1; idx = text.indexOf(keyword, idx + 1)) {
      matches.push({
        position: idx,
        context: text.substring(Math.max(0, idx - 30), idx + 70),
      });
    }

    console.log(`niedziela occurrences: ${matches.length}\n`);
    matches.forEach((match, i) => {
      console.log(`Occurrence ${i + 1} at position ${match.position}:`);
      console.log(` "${match.context}"`);
      console.log('');
    });
  } finally {
    await scraper.close();
  }
}

// Log failures explicitly and exit non-zero instead of leaving the rejection unhandled.
check().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
34
scripts/debug/check-osm-counts.ts
Normal file
34
scripts/debug/check-osm-counts.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
// Both env files are resolved against the working directory; .env.local is read first.
for (const envFile of ['.env.local', '.env']) {
  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
}
import { Pool } from 'pg';

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/**
 * Prints OSM church counts: the overall total, the 40 countries with the most
 * churches, the counts for a fixed list of countries that were under-imported,
 * and the number of distinct countries with OSM data.
 *
 * The pool is closed even when a query fails.
 */
async function main(): Promise<void> {
  try {
    const totalRes = await pool.query(`SELECT COUNT(*) as total FROM churches WHERE source = 'osm'`);
    console.log('Total OSM churches:', totalRes.rows[0].total);

    const countryRes = await pool.query(`SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country IS NOT NULL GROUP BY country ORDER BY count DESC LIMIT 40`);
    console.log('\nTop 40 countries by OSM church count:');
    for (const row of countryRes.rows) {
      console.log(` ${row.country}: ${row.count}`);
    }

    // Check key countries that were under-imported
    const keyCountries = ['AT','HR','UA','RO','LV','BY','RS','BA','MK','AL','EE','GE','AM','RU','IN','JP','CA','US','MX','AR','CO','ID','CN'];
    const keyRes = await pool.query(`SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country = ANY($1) GROUP BY country ORDER BY count DESC`, [keyCountries]);
    console.log('\nKey countries to check (were under-imported):');

    // Index the result by country so countries with no rows can be reported as 0.
    const found = new Map<string, unknown>();
    for (const row of keyRes.rows) {
      found.set(row.country, row.count);
    }
    for (const c of keyCountries) {
      console.log(` ${c}: ${found.get(c) ?? 0}`);
    }

    // Total countries
    const countriesRes = await pool.query(`SELECT COUNT(DISTINCT country) as total FROM churches WHERE source = 'osm'`);
    console.log(`\nTotal countries with OSM data: ${countriesRes.rows[0].total}`);
  } finally {
    await pool.end();
  }
}

// Log failures explicitly and exit non-zero instead of leaving the rejection unhandled.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
88
scripts/debug/check-production-db.ts
Executable file
88
scripts/debug/check-production-db.ts
Executable file
@@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env tsx
|
||||
|
||||
/**
|
||||
* Check production database (Neon) for data
|
||||
* Run with: npx tsx scripts/debug/check-production-db.ts
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { config } from 'dotenv';
|
||||
|
||||
// Load environment variables (.env.local overrides .env)
for (const envFile of ['.env.local', '.env']) {
  config({ path: envFile });
}

const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
  console.error('❌ DATABASE_URL not found in environment');
  process.exit(1);
}

console.log('🔍 Checking production database...');
// Any URL that does not mention neon.tech is labelled "localhost".
const targetLabel = connectionString.includes('neon.tech') ? 'Neon (Production)' : 'localhost';
console.log('📍 Connection:', targetLabel);

const pool = new Pool({ connectionString });
|
||||
|
||||
/**
 * Runs a series of sanity checks against the configured database and prints
 * the results: connectivity, the list of public tables, row counts for
 * churches, mass schedules and liturgical days, one sample church, and whether
 * a liturgical-day row exists for today's date.
 *
 * Failures are logged and reported through `process.exitCode = 1` instead of
 * `process.exit(1)`, which would end the process before `pool.end()` in the
 * `finally` block could run.
 */
async function checkDatabase(): Promise<void> {
  try {
    // Test connection
    console.log('\n1️⃣ Testing database connection...');
    await pool.query('SELECT NOW()');
    console.log('✅ Database connection successful');

    // Check tables exist
    console.log('\n2️⃣ Checking tables...');
    const tablesResult = await pool.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
      ORDER BY table_name
    `);
    const tableNames = tablesResult.rows.map((r) => r.table_name).join(', ');
    console.log(`✅ Found ${tablesResult.rows.length} tables:`, tableNames);

    // Check churches
    console.log('\n3️⃣ Checking churches...');
    const churchCount = await pool.query('SELECT COUNT(*) FROM "churches"');
    console.log(`📊 Churches: ${churchCount.rows[0].count}`);

    // The count is parsed explicitly as a base-10 integer before comparing.
    if (Number.parseInt(churchCount.rows[0].count, 10) > 0) {
      const sampleChurch = await pool.query('SELECT id, name, city, state, latitude, longitude FROM "churches" LIMIT 1');
      console.log('📍 Sample church:', sampleChurch.rows[0]);
    } else {
      console.log('⚠️ No churches found in database!');
    }

    // Check mass schedules
    console.log('\n4️⃣ Checking mass schedules...');
    const massCount = await pool.query('SELECT COUNT(*) FROM "mass_schedules"');
    console.log(`📊 Mass schedules: ${massCount.rows[0].count}`);

    // Check liturgical days
    console.log('\n5️⃣ Checking liturgical days...');
    const liturgicalCount = await pool.query('SELECT COUNT(*) FROM "liturgical_days"');
    console.log(`📊 Liturgical days: ${liturgicalCount.rows[0].count}`);

    // Check today's liturgical data. The date is the YYYY-MM-DD prefix of the
    // ISO timestamp, i.e. today's date in UTC.
    const today = new Date().toISOString().slice(0, 10);
    const todayData = await pool.query(
      'SELECT * FROM "liturgical_days" WHERE date = $1',
      [today],
    );
    if (todayData.rows.length > 0) {
      console.log(`✅ Today's liturgical data exists:`, todayData.rows[0].season);
    } else {
      console.log(`⚠️ No liturgical data for today (${today})`);
    }

    console.log('\n✨ Database check complete!\n');
  } catch (error) {
    console.error('❌ Error:', error);
    // Signal failure without skipping the pool shutdown below.
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
checkDatabase();
|
||||
164
scripts/debug/check-scraper-status.ts
Normal file
164
scripts/debug/check-scraper-status.ts
Normal file
@@ -0,0 +1,164 @@
|
||||
import { config } from 'dotenv';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
|
||||
// Read .env.local before .env.
for (const envFile of ['.env.local', '.env']) {
  config({ path: envFile });
}

const connectionString = process.env.DATABASE_URL;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}

// Prisma reaches Postgres through a pg pool wrapped in the pg driver adapter.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Prints a console report on mass-schedule scraping coverage.
 *
 * The report contains, in order: church counts per `source`, overall scraping
 * totals (churches with a website or mass-schedule URL, churches ever scraped,
 * churches that have schedules, total schedules, churches scraped in the last
 * seven days), the ten most recently scraped churches, and the number of
 * churches that have a website or mass-schedule URL but were never scraped.
 *
 * Errors are logged and reported through the process exit code; the Prisma
 * client and the `pg` pool are closed in every case.
 */
async function checkScraperStatus() {
  const rule = '═══════════════════════════════════════════════════════════════';

  // Formats `part / whole` as a one-decimal percentage. An empty table yields
  // "0.0" instead of "NaN".
  const percentOf = (part: number, whole: number): string =>
    (whole > 0 ? (part / whole) * 100 : 0).toFixed(1);

  // A church counts as "scrapable" when it has either URL field populated.
  const hasAnyUrl = [
    { website: { not: null } },
    { massScheduleUrl: { not: null } },
  ];

  try {
    console.log('Checking mass schedule scraper status...\n');

    // Overall church stats
    const totalChurches = await prisma.church.count();

    const churchesWithWebsites = await prisma.church.count({
      where: { OR: hasAnyUrl },
    });

    const churchesScraped = await prisma.church.count({
      where: { lastScrapedAt: { not: null } },
    });

    // Mass schedule stats
    const totalMassSchedules = await prisma.massSchedule.count();

    const churchesWithSchedules = await prisma.church.count({
      where: {
        massSchedules: {
          some: {},
        },
      },
    });

    // Recently scraped (last 7 days)
    const weekAgo = new Date();
    weekAgo.setDate(weekAgo.getDate() - 7);

    const recentlyScraped = await prisma.church.count({
      where: {
        lastScrapedAt: { gte: weekAgo },
      },
    });

    // Church counts per data source
    const bySource = await prisma.church.groupBy({
      by: ['source'],
      _count: {
        id: true,
      },
    });

    console.log(rule);
    console.log('CHURCH DATA SOURCES');
    console.log(rule);
    bySource.forEach((source) => {
      const percent = percentOf(source._count.id, totalChurches);
      console.log(`${source.source.padEnd(12)} | ${String(source._count.id).padStart(7)} churches (${percent}%)`);
    });
    console.log('');

    console.log(rule);
    console.log('MASS SCHEDULE SCRAPING STATUS');
    console.log(rule);
    console.log(`Total churches: ${totalChurches.toLocaleString()}`);
    console.log(`Churches with websites: ${churchesWithWebsites.toLocaleString()} (${percentOf(churchesWithWebsites, totalChurches)}%)`);
    console.log(`Churches ever scraped: ${churchesScraped.toLocaleString()} (${percentOf(churchesScraped, totalChurches)}%)`);
    console.log(`Churches with mass schedules: ${churchesWithSchedules.toLocaleString()} (${percentOf(churchesWithSchedules, totalChurches)}%)`);
    console.log(`Total mass schedules: ${totalMassSchedules.toLocaleString()}`);
    console.log('');
    console.log(`Scraped in last 7 days: ${recentlyScraped.toLocaleString()}`);
    console.log('');

    // Average schedules per church (only meaningful when some church has schedules)
    if (churchesWithSchedules > 0) {
      const avgSchedules = totalMassSchedules / churchesWithSchedules;
      console.log(`Average schedules per church: ${avgSchedules.toFixed(1)} masses/week`);
      console.log('');
    }

    // Sample of the ten most recently scraped churches
    const recentSample = await prisma.church.findMany({
      where: {
        lastScrapedAt: { not: null },
      },
      select: {
        name: true,
        city: true,
        state: true,
        country: true,
        lastScrapedAt: true,
        website: true,
        source: true,
        _count: {
          select: {
            massSchedules: true,
          },
        },
      },
      orderBy: { lastScrapedAt: 'desc' },
      take: 10,
    });

    console.log(rule);
    console.log('RECENTLY SCRAPED CHURCHES (Last 10)');
    console.log(rule);
    if (recentSample.length === 0) {
      console.log('No churches have been scraped yet.');
    } else {
      recentSample.forEach((church, index) => {
        // Missing location parts are dropped rather than printed as "null".
        const location = [church.city, church.state, church.country].filter(Boolean).join(', ');
        console.log(`${index + 1}. ${church.name} (${location})`);
        console.log(` Source: ${church.source}`);
        console.log(` Website: ${church.website || 'None'}`);
        console.log(` Last scraped: ${church.lastScrapedAt?.toLocaleString() || 'Never'}`);
        console.log(` Mass schedules: ${church._count.massSchedules}`);
        console.log('');
      });
    }

    // Churches ready to scrape (have a URL, never scraped)
    const readyToScrape = await prisma.church.count({
      where: {
        OR: hasAnyUrl,
        lastScrapedAt: null,
      },
    });

    console.log(rule);
    console.log('SCRAPING POTENTIAL');
    console.log(rule);
    console.log(`Churches ready to scrape: ${readyToScrape.toLocaleString()}`);
    console.log(` (have website, never scraped)`);
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // Set the exit code instead of calling process.exit(): exiting here would
    // terminate the process before the `finally` block closes the connections.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

checkScraperStatus();
|
||||
47
scripts/debug/compare-schemas.ts
Normal file
47
scripts/debug/compare-schemas.ts
Normal file
@@ -0,0 +1,47 @@
|
||||
import { Pool } from 'pg';
|
||||
|
||||
/**
 * Lists the columns of `table` as reported by `information_schema.columns`,
 * ordered by their ordinal position.
 *
 * The lookup filters on the table name only (no schema filter), so tables with
 * the same name in several schemas would all contribute rows.
 *
 * @param pool  Connection pool of the database to inspect.
 * @param table Table name to look up.
 * @returns Rows carrying `column_name` and `data_type`.
 */
async function getColumns(pool: Pool, table: string) {
  const columnQuery =
    `SELECT column_name, data_type FROM information_schema.columns WHERE table_name = $1 ORDER BY ordinal_position`;
  const { rows } = await pool.query(columnQuery, [table]);
  return rows;
}
|
||||
|
||||
/**
 * Compares the column sets of four tables between the NAS database and the
 * Neon database and prints, per table, the columns that exist on one side only
 * (or a "schemas match" line when the column names are identical).
 *
 * Only column names are compared; a column present on both sides with a
 * different `data_type` is not reported.
 *
 * Connection strings are read from the `NAS_DATABASE_URL` and
 * `NEON_DATABASE_URL` environment variables so that credentials are not kept
 * in source control.
 * NOTE(review): earlier revisions of this script embedded both connection
 * strings (including a password) directly in the file — those credentials
 * should be rotated.
 *
 * @throws Error when either environment variable is missing.
 */
async function run() {
  const nasUrl = process.env.NAS_DATABASE_URL;
  const neonUrl = process.env.NEON_DATABASE_URL;
  if (!nasUrl || !neonUrl) {
    throw new Error('NAS_DATABASE_URL and NEON_DATABASE_URL environment variables must be set');
  }

  const nas = new Pool({ connectionString: nasUrl });
  const neon = new Pool({
    connectionString: neonUrl,
    // NOTE(review): certificate verification is disabled here, as in the
    // original script — confirm whether that is still required.
    ssl: { rejectUnauthorized: false },
  });

  try {
    for (const table of ['churches', 'mass_schedules', 'confession_schedules', 'adoration_schedules']) {
      const nasCols = await getColumns(nas, table);
      const neonCols = await getColumns(neon, table);

      const nasNames = new Set(nasCols.map((c) => c.column_name));
      const neonNames = new Set(neonCols.map((c) => c.column_name));

      const onlyNas = nasCols.filter((c) => !neonNames.has(c.column_name));
      const onlyNeon = neonCols.filter((c) => !nasNames.has(c.column_name));

      if (onlyNas.length > 0 || onlyNeon.length > 0) {
        console.log(`\n=== ${table} ===`);
        if (onlyNas.length) {
          console.log(' NAS only:');
          for (const c of onlyNas) console.log(` - ${c.column_name} (${c.data_type})`);
        }
        if (onlyNeon.length) {
          console.log(' Neon only:');
          for (const c of onlyNeon) console.log(` - ${c.column_name} (${c.data_type})`);
        }
      } else {
        console.log(`\n=== ${table} === (schemas match)`);
      }
    }
  } finally {
    // Close both pools even when a query failed; allSettled keeps one failing
    // `end()` from skipping the other.
    await Promise.allSettled([nas.end(), neon.end()]);
  }
}

run().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
48
scripts/debug/data-overview.ts
Normal file
48
scripts/debug/data-overview.ts
Normal file
@@ -0,0 +1,48 @@
|
||||
import { Pool } from 'pg';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });

/**
 * Prints a quick overview of the database: total churches, churches with a
 * website, churches with mass schedules, Google-Places-enriched churches,
 * total mass schedules, top 15 countries, top 10 sources, the most recent
 * scrape timestamp, scrape-job counts per status and mass schedules per
 * language (top 10).
 *
 * NOTE(review): the queries use the quoted names "Church", "MassSchedule" and
 * "ScrapeJob", while other debug scripts in this commit query snake_case
 * tables such as "mass_schedules" and 'churches' — confirm which naming the
 * target database actually uses.
 *
 * The checked-out client is released and the pool is closed even when a query
 * fails.
 */
async function main() {
  try {
    const c = await pool.connect();
    try {
      const total = await c.query('SELECT count(*) FROM "Church"');
      console.log('\n=== DATABASE OVERVIEW ===');
      // count(*) values are passed through Number() before formatting.
      console.log('Churches total:', Number(total.rows[0].count).toLocaleString());

      const withWebsite = await c.query('SELECT count(*) FROM "Church" WHERE website IS NOT NULL');
      console.log('With website:', Number(withWebsite.rows[0].count).toLocaleString());

      const withSchedules = await c.query('SELECT count(DISTINCT "churchId") FROM "MassSchedule"');
      console.log('With mass schedules:', Number(withSchedules.rows[0].count).toLocaleString());

      const enrichedGoogle = await c.query('SELECT count(*) FROM "Church" WHERE "googlePlaceId" IS NOT NULL');
      console.log('Google Places enriched:', Number(enrichedGoogle.rows[0].count).toLocaleString());

      const totalSchedules = await c.query('SELECT count(*) FROM "MassSchedule"');
      console.log('Total mass schedules:', Number(totalSchedules.rows[0].count).toLocaleString());

      const countries = await c.query('SELECT country, count(*) as cnt FROM "Church" GROUP BY country ORDER BY cnt DESC LIMIT 15');
      console.log('\n=== TOP COUNTRIES ===');
      for (const r of countries.rows) console.log(' ' + (r.country || '(null)') + ':', Number(r.cnt).toLocaleString());

      const sources = await c.query('SELECT source, count(*) as cnt FROM "Church" GROUP BY source ORDER BY cnt DESC LIMIT 10');
      console.log('\n=== CHURCH SOURCES ===');
      for (const r of sources.rows) console.log(' ' + (r.source || '(null)') + ':', Number(r.cnt).toLocaleString());

      const lastScrape = await c.query('SELECT "lastScrapedAt" FROM "Church" WHERE "lastScrapedAt" IS NOT NULL ORDER BY "lastScrapedAt" DESC LIMIT 1');
      console.log('\n=== LAST SCRAPE ===');
      console.log(lastScrape.rows[0]?.lastScrapedAt || 'No scrapes yet');

      const jobs = await c.query('SELECT status, count(*) as cnt FROM "ScrapeJob" GROUP BY status ORDER BY cnt DESC');
      console.log('\n=== JOB STATUS ===');
      for (const r of jobs.rows) console.log(' ' + r.status + ':', Number(r.cnt).toLocaleString());

      const schedulesByLang = await c.query('SELECT language, count(*) as cnt FROM "MassSchedule" GROUP BY language ORDER BY cnt DESC LIMIT 10');
      console.log('\n=== SCHEDULES BY LANGUAGE ===');
      for (const r of schedulesByLang.rows) console.log(' ' + (r.language || '(null)') + ':', Number(r.cnt).toLocaleString());
    } finally {
      // Return the client to the pool even when a query throws.
      c.release();
    }
  } finally {
    await pool.end();
  }
}

main().catch((e: unknown) => {
  // Rejections are not guaranteed to be Error instances.
  console.error(e instanceof Error ? e.message : e);
  process.exit(1);
});
|
||||
58
scripts/debug/debug-french-page.ts
Normal file
58
scripts/debug/debug-french-page.ts
Normal file
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug a specific French page to see why scraping failed
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes one hard-coded French site with `GenericScraper` (country set to
 * 'FR') and prints diagnostics: the success flag, the number of schedules,
 * any error, the first 2000 characters of the tag-stripped page text, which
 * French day names occur in it, and up to 20 time-like matches.
 *
 * The scraper is closed even when scraping throws.
 */
async function debugPage() {
  const url = 'https://www.chemin-neuf.fr/'; // Last failed church
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('FR');

    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules found: ${result.schedules.length}`);
    if (result.error) console.log(`Error: ${result.error}`);

    if (result.rawHtml) {
      // Reduce the HTML to lower-cased plain text: drop scripts and styles,
      // replace remaining tags with spaces, collapse whitespace.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      console.log('\n=== Page Text Sample (first 2000 chars) ===');
      console.log(text.substring(0, 2000));
      console.log('\n');

      // Check for French day names
      const frenchDays = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'];
      console.log('=== French day names found ===');
      for (const day of frenchDays) {
        if (text.includes(day)) {
          console.log(`✓ Found: ${day}`);
        }
      }

      // Check for time patterns. The text is already lower-cased, so the unit
      // suffixes are matched case-insensitively; upper-case alternatives such
      // as "Uhr" could never match otherwise.
      console.log('\n=== Time patterns (sample) ===');
      const timeRegex = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:am|pm|uhr|uur|h)?/gi;
      const times = text.match(timeRegex);
      if (times) {
        console.log(`Found ${times.length} time-like patterns:`);
        console.log(times.slice(0, 20).join(', '));
      } else {
        console.log('No time patterns found');
      }
    }
  } finally {
    await scraper.close();
  }
}

debugPage().catch((error) => {
  console.error(error);
  // Signal failure to the caller instead of exiting with status 0.
  process.exitCode = 1;
});
|
||||
65
scripts/debug/debug-german-duplicates.ts
Normal file
65
scripts/debug/debug-german-duplicates.ts
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug why German church has duplicate schedules
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
// Debug-only monkey patch: wraps GenericScraper's `parseSchedules` so that every
// call first logs the sections returned by `findScheduleSections`, then runs
// the original parser and logs the extracted times grouped by weekday.
const originalParse = GenericScraper.prototype['parseSchedules'];

GenericScraper.prototype['parseSchedules'] = function(html: string) {
  const weekdayLabels = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];

  // Same tag-stripping steps the other debug scripts use: drop scripts and
  // styles, turn tags into spaces, collapse whitespace, lower-case.
  const withoutScripts = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
  const withoutStyles = withoutScripts.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  const plainText = withoutStyles
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();

  // Ask the scraper for its sections and dump them.
  const sections = this['findScheduleSections'](plainText);

  console.log('\n=== Sections found ===\n');
  sections.forEach((section: any, i: number) => {
    const preview = section.text.substring(0, 100);
    console.log(`Section ${i + 1}: ${weekdayLabels[section.day]} (day ${section.day})`);
    console.log(` Text preview: "${preview}..."`);
  });
  console.log(`\nTotal sections: ${sections.length}\n`);

  // Hand over to the unpatched implementation for the real result.
  const result = originalParse.call(this, html);

  console.log(`\n=== Extracted times per section ===\n`);

  // Bucket the parsed schedules by their `dayOfWeek` index.
  const schedsByDay: Record<number, typeof result> = {};
  for (const sched of result) {
    const bucket = schedsByDay[sched.dayOfWeek];
    if (bucket) {
      bucket.push(sched);
    } else {
      schedsByDay[sched.dayOfWeek] = [sched];
    }
  }

  // Print one line per weekday that has at least one schedule.
  weekdayLabels.forEach((label, day) => {
    const bucket = schedsByDay[day];
    if (!bucket) return;
    console.log(`${label}: ${bucket.map(s => s.time).join(', ')}`);
  });

  return result;
};
|
||||
|
||||
/**
 * Scrapes one hard-coded German site with `GenericScraper` (country set to
 * 'DE') and prints the success flag and the total number of schedules.
 *
 * The scraper is closed even when scraping throws.
 */
async function testGerman() {
  const url = 'https://www.alterpeter.de/';
  console.log(`Testing: ${url}`);

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape(url);

    console.log(`\n=== Final Result ===`);
    console.log(`Success: ${result.success}`);
    console.log(`Total schedules: ${result.schedules.length}`);
  } finally {
    await scraper.close();
  }
}

testGerman().catch((error) => {
  console.error(error);
  // Signal failure to the caller instead of exiting with status 0.
  process.exitCode = 1;
});
|
||||
44
scripts/debug/debug-masstimes.ts
Normal file
44
scripts/debug/debug-masstimes.ts
Normal file
@@ -0,0 +1,44 @@
|
||||
import { chromium } from 'playwright';
|
||||
|
||||
/**
 * Loads a hard-coded masstimes.org search URL in headless Chromium, saves a
 * full-page screenshot to /tmp/masstimes-debug.png, prints the first 5000
 * characters of the page HTML and up to 50 text nodes whose trimmed length is
 * between 11 and 99 characters.
 *
 * The browser is closed even when navigation or evaluation throws.
 */
async function main() {
  const browser = await chromium.launch({ headless: true });

  try {
    const page = await browser.newPage();

    const url = 'https://masstimes.org/search?lat=32.7765&lng=-79.9311&type=parish';
    console.log('Loading:', url);

    await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });

    // Fixed 5 s pause; the original author's note says this waits for Angular
    // to render.
    await page.waitForTimeout(5000);

    // Take screenshot
    await page.screenshot({ path: '/tmp/masstimes-debug.png', fullPage: true });
    console.log('Screenshot saved to /tmp/masstimes-debug.png');

    // Get page HTML
    const html = await page.content();
    console.log('\n--- PAGE HTML (first 5000 chars) ---\n');
    console.log(html.substring(0, 5000));

    // Collect text nodes that might be church names. This callback runs in the
    // browser page, not in Node.
    const visibleText = await page.evaluate(() => {
      const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
      const texts: string[] = [];
      let node: Node | null;
      while ((node = walker.nextNode())) {
        const text = node.textContent?.trim();
        if (text && text.length > 10 && text.length < 100) {
          texts.push(text);
        }
      }
      return texts.slice(0, 50);
    });

    console.log('\n--- VISIBLE TEXT SNIPPETS ---\n');
    visibleText.forEach((t, i) => console.log(`${i + 1}. ${t}`));
  } finally {
    await browser.close();
  }
}

main().catch((error) => {
  console.error(error);
  // Signal failure to the caller instead of exiting with status 0.
  process.exitCode = 1;
});
|
||||
74
scripts/debug/debug-paroquia-paz.ts
Normal file
74
scripts/debug/debug-paroquia-paz.ts
Normal file
@@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Deep dive into Paróquia da Paz parsing bug
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes the Paróquia da Paz site with `GenericScraper` (country set to 'BR')
 * and prints diagnostics: the success flag and schedule count, the first
 * occurrence of each Portuguese day name with surrounding context, up to 30
 * "h"-format times (e.g. "16h", "8h30"), and 300 characters of text following
 * the first occurrence of each schedule keyword.
 *
 * The scraper is closed even when scraping throws.
 */
async function debugPaz() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('BR');

    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      // Reduce the HTML to lower-cased plain text: drop scripts and styles,
      // replace remaining tags with spaces, collapse whitespace.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find where days appear
      console.log('=== Finding day + time patterns ===\n');

      // Accented and unaccented spellings are both listed.
      const days = ['domingo', 'segunda', 'terça', 'terca', 'quarta', 'quinta', 'sexta', 'sábado', 'sabado'];

      for (const day of days) {
        const dayIndex = text.indexOf(day);
        if (dayIndex !== -1) {
          // Show context around the day (100 chars before and 200 after)
          const before = Math.max(0, dayIndex - 100);
          const after = Math.min(text.length, dayIndex + 200);
          const snippet = text.substring(before, after);

          console.log(`${day.toUpperCase()}:`);
          console.log(` Position: ${dayIndex}`);
          console.log(` Context: ...${snippet}...`);
          console.log('');
        }
      }

      // Check for "h" time format specifically
      console.log('\n=== Checking "h" time format ===');
      const hTimeRegex = /(\d{1,2})h(\d{2})?/g;
      const hTimes = text.match(hTimeRegex);
      if (hTimes) {
        console.log(`Found ${hTimes.length} "h" format times:`);
        console.log(hTimes.slice(0, 30).join(', '));
      }

      // Look for schedule structure
      console.log('\n=== Looking for schedule structure ===');
      const scheduleKeywords = ['horário', 'horario', 'missa', 'missas', 'santa missa'];
      for (const keyword of scheduleKeywords) {
        const index = text.indexOf(keyword);
        if (index !== -1) {
          console.log(`\nFound "${keyword}" at position ${index}:`);
          // 300 characters starting at the keyword (substring clamps at the end).
          console.log(text.substring(index, index + 300));
        }
      }
    }
  } finally {
    await scraper.close();
  }
}

debugPaz().catch((error) => {
  console.error(error);
  // Signal failure to the caller instead of exiting with status 0.
  process.exitCode = 1;
});
|
||||
150
scripts/debug/debug-parsing-bugs.ts
Normal file
150
scripts/debug/debug-parsing-bugs.ts
Normal file
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug the 5 parsing bugs identified in top 5 test
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
// Load `.env.local` first, then `.env`.
for (const path of ['.env.local', '.env']) {
  config({ path });
}
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
// Prisma is wired to Postgres through a `pg` pool wrapped in the PrismaPg adapter.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

// The churches with parsing bugs, written as [display name, country code,
// name fragment used for the database lookup].
const bugChurchRows: Array<[string, string, string]> = [
  ['St. Marien', 'DE', 'St. Marien'],
  ['Santuario de Manalagua', 'ES', 'Santuario de Manalagua'],
  ['Kościół pw. Najświętszego Serca', 'PL', 'Najświętszego Serca Pana Jez'],
  ['Paróquia de Nossa Senhora do Desterro', 'BR', 'Nossa Senhora do Desterro'],
  ['Paróquia da Paz', 'BR', 'Paróquia da Paz'],
];

const BUG_CHURCHES = bugChurchRows.map(([name, country, searchTerm]) => ({
  name,
  country,
  searchTerm,
}));
|
||||
|
||||
/**
 * For every entry in `BUG_CHURCHES`, looks up a matching church with a website
 * in the database, scrapes that website with `GenericScraper` and prints
 * diagnostics: scrape outcome, a 1000-character text sample, which day names
 * and mass-schedule keywords of the church's country occur in the text, up to
 * 20 unique time-like matches, and three heuristics (times appearing before
 * day names, table layout, multiple languages).
 *
 * A failure while scraping one church is logged and the loop continues. The
 * scraper, the Prisma client and the `pg` pool are closed in every case.
 */
async function debugBugs() {
  // Lookup tables keyed by country code; built once because they do not
  // depend on the church being processed.
  const dayPatterns: Record<string, string[]> = {
    DE: ['sonntag', 'montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag'],
    ES: ['domingo', 'lunes', 'martes', 'miércoles', 'miercoles', 'jueves', 'viernes', 'sábado', 'sabado'],
    PL: ['niedziela', 'poniedziałek', 'poniedzialek', 'wtorek', 'środa', 'sroda', 'czwartek', 'piątek', 'piatek', 'sobota'],
    BR: ['domingo', 'segunda', 'terça', 'terca', 'quarta', 'quinta', 'sexta', 'sábado', 'sabado'],
  };
  const keywords: Record<string, string[]> = {
    DE: ['gottesdienst', 'messe', 'heilige messe', 'messzeiten'],
    ES: ['misa', 'horario', 'eucaristía', 'eucaristia'],
    PL: ['msza', 'msze', 'nabożeństwo', 'nabozenstwo'],
    BR: ['missa', 'horário', 'horario', 'eucaristia'],
  };
  const timeRegex = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h|uhr)?/gi;

  console.log('Debugging parsing bugs...\n');

  try {
    const scraper = new GenericScraper();
    await scraper.init();

    try {
      for (const bug of BUG_CHURCHES) {
        console.log('═'.repeat(80));
        console.log(`BUG: ${bug.name} (${bug.country})`);
        console.log('═'.repeat(80));

        const church = await prisma.church.findFirst({
          where: {
            country: bug.country,
            name: { contains: bug.searchTerm },
            website: { not: null },
          },
        });

        // The query already requires a website; checking it here lets the
        // compiler narrow the type without a non-null assertion.
        if (!church || !church.website) {
          console.log(`❌ Church not found in database\n`);
          continue;
        }

        console.log(`Church: ${church.name}`);
        console.log(`URL: ${church.website}\n`);

        scraper.setCountry(bug.country);

        try {
          const result = await scraper.scrape(church.website);

          console.log(`Success: ${result.success}`);
          console.log(`Schedules found: ${result.schedules.length}`);
          if (result.error) console.log(`Error: ${result.error}`);

          if (result.rawHtml) {
            // Reduce the HTML to lower-cased plain text: drop scripts and
            // styles, replace remaining tags with spaces, collapse whitespace.
            const text = result.rawHtml
              .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
              .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
              .replace(/<[^>]+>/g, ' ')
              .replace(/\s+/g, ' ')
              .toLowerCase();

            console.log('\n--- Text Sample (first 1000 chars) ---');
            console.log(text.substring(0, 1000));

            // Check for day names
            console.log('\n--- Day Names Found ---');
            const foundDays = (dayPatterns[bug.country] || []).filter((day) => text.includes(day));
            console.log(`Found: ${foundDays.join(', ') || 'none'}`);

            // Check for time patterns
            console.log('\n--- Time Patterns Found ---');
            const times = text.match(timeRegex);
            if (times) {
              const uniqueTimes = [...new Set(times)].slice(0, 20);
              console.log(`Found ${times.length} time patterns (showing first 20 unique):`);
              console.log(uniqueTimes.join(', '));
            } else {
              console.log('No time patterns found');
            }

            // Look for specific mass schedule keywords
            console.log('\n--- Mass Schedule Keywords ---');
            const foundKeywords = (keywords[bug.country] || []).filter((keyword) => text.includes(keyword));
            console.log(`Found: ${foundKeywords.join(', ') || 'none'}`);

            // Look for specific problematic patterns
            console.log('\n--- Looking for edge cases ---');

            // Times before days: compare the earliest day-name position with
            // the position of the first time match, and only when both exist.
            // (The previous check compared against placeholder strings and
            // reported YES whenever days were found but no times were.)
            const firstDayIndex =
              foundDays.length > 0 ? Math.min(...foundDays.map((day) => text.indexOf(day))) : -1;
            const firstTimeIndex = times ? text.indexOf(times[0]) : -1;
            const hasTimeBeforeDays =
              firstDayIndex !== -1 && firstTimeIndex !== -1 && firstTimeIndex < firstDayIndex;
            console.log(`Times come before days: ${hasTimeBeforeDays ? 'YES (potential issue)' : 'no'}`);

            // Table structures: `text` has had every tag removed, so markup
            // has to be detected on the raw HTML. The pipe-separator heuristic
            // still runs on the plain text.
            const hasTables =
              /<table\b/i.test(result.rawHtml) || (text.match(/\s+\|\s+/g)?.length || 0) > 5;
            console.log(`Likely table format: ${hasTables ? 'YES (may need special handling)' : 'no'}`);

            // Check for multiple languages on same page
            const hasMultiLang = (text.match(/english|español|espanol|portuguese|português|portugues|deutsch|polski/gi)?.length || 0) > 1;
            console.log(`Multiple languages: ${hasMultiLang ? 'YES (may confuse parser)' : 'no'}`);
          }

          console.log('\n');
        } catch (err: unknown) {
          // Keep going with the next church when one site fails.
          const message = err instanceof Error ? err.message : String(err);
          console.log(`❌ ERROR: ${message}\n`);
        }
      }
    } finally {
      await scraper.close();
    }
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

debugBugs().catch((error) => {
  console.error(error);
  // Signal failure to the caller instead of exiting with status 0.
  process.exitCode = 1;
});
|
||||
98
scripts/debug/debug-paz-full-flow.ts
Normal file
98
scripts/debug/debug-paz-full-flow.ts
Normal file
@@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug the full parsing flow with section detection
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
/**
 * Scrapes the Paróquia da Paz site with `GenericScraper` (country set to 'BR'),
 * cuts a 500-character snippet starting at the text "segundas, terças", tests
 * the single-day section regex for "sábados", "sabados" and "domingos" against
 * that snippet, and finally prints how many parsed schedules fall on each
 * weekday.
 *
 * Returns early (after logging) when no HTML was received or the schedule
 * text is not on the page. The scraper is closed on every path, including
 * when scraping throws.
 */
async function debugFullFlow() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('BR');

    const result = await scraper.scrape(url);

    if (!result.rawHtml) {
      console.log('No HTML received');
      return;
    }

    // Reduce the HTML to lower-cased plain text: drop scripts and styles,
    // replace remaining tags with spaces, collapse whitespace.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Find the schedule section
    const scheduleIndex = text.indexOf('segundas, terças');
    if (scheduleIndex === -1) {
      console.log('Schedule text not found!');
      return;
    }

    const snippet = text.substring(scheduleIndex, scheduleIndex + 500);
    console.log('Schedule snippet from actual HTML:');
    console.log(snippet);
    console.log('\n');

    // Now test section matching on actual text. Day names are sorted longest
    // first before being joined into the alternation, and regex
    // metacharacters in them are escaped.
    const dayConfigs = getDayNamesForCountry('BR');
    const dayPatterns = buildDayPatterns(dayConfigs);
    const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
    const allDayNamesPattern = sortedDayNames.map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');

    console.log('=== Testing sábados and domingos matches ===\n');

    // Accented "sábados", unaccented "sabados", then "domingos": the captured
    // group is the text after the day word up to the next day name or the end.
    for (const dayWord of ['sábados', 'sabados', 'domingos']) {
      const dayRegex = new RegExp(
        `(?:^|\\s|[,;:])${dayWord}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
        'i'
      );
      const dayMatch = snippet.match(dayRegex);
      console.log(`${dayWord} match:`, dayMatch ? `Found: "${dayMatch[1].substring(0, 50)}"` : 'Not found');
    }

    console.log('\n=== Final parsed schedules ===\n');
    console.log(`Total: ${result.schedules.length}`);

    // Bucket the parsed schedules by their `dayOfWeek` index.
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    const dayNames = ['Domingo', 'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado'];
    for (let i = 0; i < 7; i++) {
      if (byDay[i]) {
        console.log(`${dayNames[i]}: ${byDay[i].length} schedules`);
      } else {
        console.log(`${dayNames[i]}: 0 schedules ❌`);
      }
    }
  } finally {
    // Single cleanup point for the early returns and for thrown errors.
    await scraper.close();
  }
}

debugFullFlow().catch((error) => {
  console.error(error);
  // Signal failure to the caller instead of exiting with status 0.
  process.exitCode = 1;
});
|
||||
56
scripts/debug/debug-paz-sections.ts
Normal file
56
scripts/debug/debug-paz-sections.ts
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug which sections are being found
|
||||
*/
|
||||
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
// Fixed sample reproducing the schedule text from the page, so the section
// regexes can be exercised without fetching anything.
const scheduleText = `
horário das missas igreja matriz de santo antônio
segundas, terças, quartas e sextas-feiras: 16h e 18h.
quintas-feiras: 16h e 19h (adoração ao santíssimo – 18h).
sábados: 8h, 16h e 18h.
domingos: 8h, 11h, 16h, 18h e 20h.
`.toLowerCase();

console.log('Text to parse:');
console.log(scheduleText);
console.log('');

// Build one alternation of all Brazilian day names: longest names first, with
// regex metacharacters escaped.
const dayConfigs = getDayNamesForCountry('BR');
const dayPatterns = buildDayPatterns(dayConfigs);
const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
const allDayNamesPattern = sortedDayNames
  .map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
  .join('|');

console.log('=== COMMA-SEPARATED GROUP MATCHING ===\n');

// Group 1: two or more day names joined by commas/whitespace and an optional
// conjunction. Group 2: the text that follows, up to the next day name or the
// end of the input.
const dayGroupRegex = new RegExp(
  `((?:${allDayNamesPattern})(?:[,\\s]+(?:e|and|et|und|y)?\\s*(?:${allDayNamesPattern}))+)[:\\s]+([^]*?)(?=(?:${allDayNamesPattern})|$)`,
  'gi'
);

let matchCount = 0;
for (
  let groupMatch = dayGroupRegex.exec(scheduleText);
  groupMatch !== null;
  groupMatch = dayGroupRegex.exec(scheduleText)
) {
  matchCount += 1;
  console.log(`Match #${matchCount}:`);
  console.log(` Day group: "${groupMatch[1]}"`);
  console.log(` Time text: "${groupMatch[2]}"`);
  console.log('');
}

console.log('=== INDIVIDUAL DAY MATCHING ===\n');

// Try every known day name on its own and report the first match for each.
Object.entries(dayPatterns).forEach(([dayName, dayIndex]) => {
  const escaped = dayName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const singleDayRegex = new RegExp(
    `(?:^|\\s|[,;:])${escaped}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );
  const match = scheduleText.match(singleDayRegex);
  if (!match) return;
  console.log(`Found ${dayName} (day ${dayIndex}):`);
  console.log(` Time text: "${match[1].substring(0, 100)}"`);
});
|
||||
85
scripts/debug/debug-paz-with-logging.ts
Normal file
85
scripts/debug/debug-paz-with-logging.ts
Normal file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug Paróquia da Paz with added logging
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
/**
 * Scrapes the Paróquia da Paz homepage with the generic scraper (country BR)
 * and replays the comma-separated day-group regex against the page text,
 * logging every match.
 *
 * When the regex matches nothing, looks for the literal "segundas, terças"
 * text and reports which of the ten longest day names occur in the 300
 * characters that follow it.
 */
async function debugPazWithLogging() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  // Close the scraper even when scraping or the analysis below throws, so a
  // failed run does not leave whatever init() acquired hanging around.
  try {
    scraper.setCountry('BR');

    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);

    if (!result.rawHtml) {
      return;
    }

    // Reduce the HTML to lower-cased plain text: drop scripts, styles and
    // tags, then collapse runs of whitespace.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Test the regex pattern manually
    console.log('=== Testing comma-separated day grouping regex ===\n');

    const dayConfigs = getDayNamesForCountry('BR');
    const dayPatterns = buildDayPatterns(dayConfigs);
    // Longest names first, then regex-escaped and joined into one alternation.
    const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
    const allDayNamesPattern = sortedDayNames
      .map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('|');

    console.log('Day patterns:', Object.keys(dayPatterns).join(', '));
    console.log('');

    // The exact regex from the code
    const dayGroupRegex = new RegExp(
      `((?:${allDayNamesPattern})(?:[,\\s]+(?:e|and|et|und|y)?\\s*(?:${allDayNamesPattern}))+)[:\\s]+([^]*?)(?=(?:${allDayNamesPattern})|$)`,
      'gi'
    );

    console.log('Regex pattern:', dayGroupRegex.source.substring(0, 200) + '...\n');

    let groupMatch;
    let matchCount = 0;
    while ((groupMatch = dayGroupRegex.exec(text)) !== null) {
      matchCount++;
      console.log(`Match #${matchCount}:`);
      console.log(` Full match: "${groupMatch[0].substring(0, 100)}"`);
      console.log(` Day group: "${groupMatch[1]}"`);
      console.log(` Time text: "${groupMatch[2].substring(0, 50)}"`);
      console.log('');
    }

    if (matchCount > 0) {
      return;
    }

    console.log('No matches found!\n');

    // Try to find the schedule text manually
    const scheduleIndex = text.indexOf('segundas, terças');
    if (scheduleIndex === -1) {
      return;
    }

    const snippet = text.substring(scheduleIndex, scheduleIndex + 300);
    console.log('Found schedule text at position', scheduleIndex);
    console.log('Snippet:', snippet);
    console.log('');

    // Test if individual day names are matching
    console.log('Testing individual day name matches in snippet:');
    for (const dayName of sortedDayNames.slice(0, 10)) {
      if (snippet.includes(dayName)) {
        console.log(` ✓ Found: ${dayName}`);
      }
    }
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
debugPazWithLogging().catch(console.error);
|
||||
85
scripts/debug/debug-polish-church.ts
Normal file
85
scripts/debug/debug-polish-church.ts
Normal file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug Polish church in detail
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
/**
 * Scrapes the Lubojna parish site with the generic scraper (country PL) and
 * prints diagnostics: the text following the "msze święte" heading, the
 * space-separated and colon-separated times found in it, the Polish day names
 * it contains, and finally the schedules the scraper parsed, grouped by day.
 */
async function debugPolish() {
  const url = 'http://parafialubojna.pl';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  // Close the scraper even when scraping or the analysis below throws.
  try {
    scraper.setCountry('PL');

    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules found: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      // Reduce the HTML to lower-cased plain text: drop scripts, styles and
      // tags, then collapse runs of whitespace.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find the schedule section. indexOf() reports "not found" as -1, which
      // is truthy, so `a || b` would never try the unaccented spelling (and
      // would discard a hit at index 0); compare against -1 explicitly.
      const accentedIndex = text.indexOf('msze święte');
      const scheduleIndex = accentedIndex !== -1 ? accentedIndex : text.indexOf('msze swiete');

      if (scheduleIndex !== -1) {
        const snippet = text.substring(scheduleIndex, scheduleIndex + 500);
        console.log('Schedule section:');
        console.log(snippet);
        console.log('\n');

        // Test all time pattern matches
        console.log('=== Testing time pattern matches ===\n');

        // Space separator pattern
        const spacePattern = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;
        const spaceMatches = snippet.match(spacePattern);
        console.log('Space-separated times (8 00, 9 30):');
        console.log(spaceMatches ? spaceMatches.join(', ') : 'none');
        console.log('');

        // Colon pattern
        const colonPattern = /\d{1,2}:\d{2}/g;
        const colonMatches = snippet.match(colonPattern);
        console.log('Colon times (8:00, 9:30):');
        console.log(colonMatches ? colonMatches.join(', ') : 'none');
        console.log('');

        // Polish day names
        console.log('=== Polish day names in snippet ===\n');
        const dayConfigs = getDayNamesForCountry('PL');
        const dayPatterns = buildDayPatterns(dayConfigs);

        for (const [dayName, dayNum] of Object.entries(dayPatterns)) {
          if (snippet.includes(dayName)) {
            console.log(`Found: ${dayName} (day ${dayNum})`);
          }
        }
      }
    }

    console.log('\n=== Parsed schedules ===\n');

    // Bucket the parsed schedules by their dayOfWeek value.
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    // Labels are indexed by dayOfWeek; index 0 is printed as Sunday (Niedziela).
    const dayNames = ['Niedziela', 'Poniedziałek', 'Wtorek', 'Środa', 'Czwartek', 'Piątek', 'Sobota'];
    for (let i = 0; i < 7; i++) {
      if (byDay[i]) {
        console.log(`${dayNames[i]}: ${byDay[i].map(s => s.time).join(', ')}`);
      }
    }
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
debugPolish().catch(console.error);
|
||||
79
scripts/debug/debug-polish-sunday-monday.ts
Normal file
79
scripts/debug/debug-polish-sunday-monday.ts
Normal file
@@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug why Sunday and Monday aren't parsing for Polish church
|
||||
*/
|
||||
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
// Schedule text copied from the parish website, lower-cased for matching.
const text = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();

console.log('Text to parse:');
console.log(text);
console.log('\n');

// Every Polish day name, longest first, regex-escaped and joined into one alternation.
const sortedDayNames = Object.keys(buildDayPatterns(getDayNamesForCountry('PL'))).sort(
  (left, right) => right.length - left.length
);
const allDayNamesPattern = sortedDayNames.map(escapeForRegex).join('|');

/** Escapes regex metacharacters so `value` is matched literally. */
function escapeForRegex(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Runs the single-day regex for `dayName` against `text` and logs whether it
 * matched, what it captured, and which space-separated times the capture holds.
 */
function reportDayMatch(dayName: string): void {
  const dayRegex = new RegExp(
    `(?:^|\\s|[,;:])${escapeForRegex(dayName)}(?:(?:[^:]{1,50})?:|\\s+)([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );

  const found = text.match(dayRegex);
  if (!found) {
    console.log(`✗ NOT matched`);
    return;
  }

  console.log(`✓ Matched!`);
  console.log(` Full match: "${found[0].substring(0, 100)}"`);
  console.log(` Captured text: "${found[1].substring(0, 100)}"`);
  console.log('');

  // Check if times can be extracted
  const times = found[1].match(/\b(\d{1,2})\s+(\d{2})(?!\d)/g);
  console.log(` Times found: ${times ? times.join(', ') : 'none'}`);
}

console.log('=== Testing niedziela (Sunday) ===\n');
reportDayMatch('niedziela');

console.log('\n=== Testing poniedziałek (Monday) ===\n');
reportDayMatch('poniedziałek');

console.log('\n=== Analyzing why niedziela might fail ===\n');

// The heading is "niedziela i uroczystości:", which is long. Print where the
// next day name starts after "niedziela" to see what the lookahead stops at.
const niedzielaIndex = text.indexOf('niedziela');
const laterDayPositions = sortedDayNames
  .filter(name => name !== 'niedziela')
  .map(name => text.indexOf(name, niedzielaIndex))
  .filter(position => position > 0);
const nextDayIndex = Math.min(...laterDayPositions);

console.log(`niedziela position: ${niedzielaIndex}`);
console.log(`Next day name position: ${nextDayIndex}`);
console.log(`Text between: "${text.substring(niedzielaIndex, nextDayIndex)}"`);
|
||||
44
scripts/debug/debug-thursday-context.ts
Normal file
44
scripts/debug/debug-thursday-context.ts
Normal file
@@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes alterpeter.de (country DE) and prints every "montag … bis …
 * donnerstag" occurrence in the page text, with 150 characters of context
 * before the match and 250 characters starting at the match.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();

  // Close the scraper even when scraping or the analysis below throws.
  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape('https://www.alterpeter.de/');
    if (!result.rawHtml) {
      return;
    }

    // Reduce the HTML to lower-cased plain text: drop scripts, styles and
    // tags, then collapse runs of whitespace.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Find "montag bis donnerstag" pattern
    const pattern = /montag[^]*?bis[^]*?donnerstag/gi;
    const matches = [...text.matchAll(pattern)];

    console.log(`Found ${matches.length} instances of "montag bis donnerstag":\n`);

    matches.forEach((match, i) => {
      const matchIndex = match.index ?? 0;
      const contextBefore = text.substring(Math.max(0, matchIndex - 150), matchIndex);
      const contextAfter = text.substring(matchIndex, Math.min(text.length, matchIndex + 250));

      console.log(`=== Instance ${i + 1} ===`);
      console.log(`Position: ${matchIndex}`);
      console.log(`\nContext BEFORE (150 chars):`);
      console.log(`"${contextBefore}"`);
      console.log(`\nContext AFTER (250 chars):`);
      console.log(`"${contextAfter}"`);
      console.log('');
    });
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
45
scripts/debug/debug-zero-time.ts
Normal file
45
scripts/debug/debug-zero-time.ts
Normal file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes alterpeter.de (country DE) and prints every "00 uhr" occurrence in
 * the page text, with 50 characters of context before the match and 100
 * characters starting at the match, followed by the total count.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();

  // Close the scraper even when scraping or the analysis below throws.
  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape('https://www.alterpeter.de/');
    if (!result.rawHtml) {
      return;
    }

    // Reduce the HTML to lower-cased plain text: drop scripts, styles and
    // tags, then collapse runs of whitespace.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Find all instances of "00 uhr" pattern
    const pattern = /\b00\s*uhr/g;
    let count = 0;
    let match;

    console.log('Looking for "00 uhr" patterns:\n');

    while ((match = pattern.exec(text)) !== null) {
      count++;
      const matchIndex = match.index;
      const contextBefore = text.substring(Math.max(0, matchIndex - 50), matchIndex);
      const contextAfter = text.substring(matchIndex, Math.min(text.length, matchIndex + 100));

      console.log(`=== Occurrence ${count} at position ${matchIndex} ===`);
      console.log(`BEFORE: "...${contextBefore}"`);
      console.log(`MATCH + AFTER: "${contextAfter}..."`);
      console.log('');
    }

    console.log(`Total "00 uhr" occurrences: ${count}`);
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
37
scripts/debug/export-de-from-neon.ts
Normal file
37
scripts/debug/export-de-from-neon.ts
Normal file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { config } from 'dotenv';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import fs from 'fs/promises';
|
||||
|
||||
config({ path: '.env.local' });
|
||||
|
||||
/**
 * Exports every church with country 'DE', together with its mass, confession
 * and adoration schedules, from the database named by DATABASE_URL into
 * `export-DE.json` in the current working directory.
 *
 * @throws Error when DATABASE_URL is not set.
 */
async function main() {
  console.log('📦 Exporting Germany from Neon...');

  // Fail fast with a clear message instead of letting the pool fall back to
  // its own connection defaults.
  const connectionString = process.env.DATABASE_URL;
  if (!connectionString) {
    throw new Error('DATABASE_URL environment variable is not set');
  }

  const pool = new Pool({ connectionString });
  const adapter = new PrismaPg(pool);
  const prisma = new PrismaClient({ adapter });

  try {
    await prisma.$connect();

    const churches = await prisma.churches.findMany({
      where: { country: 'DE' },
      include: {
        massSchedules: true,
        confessionSchedules: true,
        adorationSchedules: true,
      }
    });

    console.log(`Found ${churches.length} churches in Germany`);

    await fs.writeFile('export-DE.json', JSON.stringify(churches, null, 2));
    console.log(`✅ Exported to export-DE.json`);
  } finally {
    // Release the client and the pool even when the query or the write fails,
    // so an error does not leave open connections behind.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
60
scripts/debug/export-from-nas.ts
Normal file
60
scripts/debug/export-from-nas.ts
Normal file
@@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Export churches from NAS database to JSON
|
||||
* Run this ON THE NAS (uses DATABASE_URL from .env)
|
||||
*/
|
||||
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import fs from 'fs/promises';
|
||||
|
||||
/**
 * Exports all churches of one country, with their mass, confession and
 * adoration schedules, to `export-<country>.json`.
 *
 * The country code is read from the first CLI argument and defaults to 'PL'.
 * On failure the error is logged, the client is disconnected and the process
 * exits with status 1.
 */
async function main() {
  const country = process.argv[2] || 'PL';

  console.log(`📦 Exporting ${country} data from database...`);
  // The password part of the connection string is masked before logging.
  console.log(`DATABASE_URL: ${process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@')}`);

  const prisma = new PrismaClient();

  try {
    await prisma.$connect();
    console.log('✅ Connected to database');

    // Export churches with all schedules
    const churches = await prisma.churches.findMany({
      where: { country },
      include: {
        massSchedules: true,
        confessionSchedules: true,
        adorationSchedules: true,
      }
    });

    console.log(`Found ${churches.length} churches in ${country}`);

    // Tally the schedules of each kind in a single pass.
    let massSchedules = 0;
    let confessionSchedules = 0;
    let adorationSchedules = 0;
    for (const church of churches) {
      massSchedules += church.massSchedules?.length || 0;
      confessionSchedules += church.confessionSchedules?.length || 0;
      adorationSchedules += church.adorationSchedules?.length || 0;
    }

    // Save to file
    const exportFile = `export-${country}.json`;
    await fs.writeFile(exportFile, JSON.stringify(churches, null, 2));

    const summaryLines = [
      `\n✅ Exported to ${exportFile}`,
      ` - ${churches.length} churches`,
      ` - ${massSchedules} mass schedules`,
      ` - ${confessionSchedules} confession schedules`,
      ` - ${adorationSchedules} adoration schedules`,
      `\nDownload with:`,
      ` scp albert@192.168.0.145:/volume1/docker/nearestmass/${exportFile} .`,
    ];
    for (const line of summaryLines) {
      console.log(line);
    }

    await prisma.$disconnect();
  } catch (error) {
    console.error('❌ Export failed:', error);
    await prisma.$disconnect();
    process.exit(1);
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
230
scripts/debug/export-import-to-neon.ts
Normal file
230
scripts/debug/export-import-to-neon.ts
Normal file
@@ -0,0 +1,230 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Export churches from local NAS database and import to Neon
|
||||
*/
|
||||
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
|
||||
/** Row counts produced by one country export. */
interface ExportStats {
  /** Number of churches exported. */
  churches: number;
  /** Total mass schedules across the exported churches. */
  massSchedules: number;
  /** Total confession schedules across the exported churches. */
  confessionSchedules: number;
  /** Total adoration schedules across the exported churches. */
  adorationSchedules: number;
}
|
||||
|
||||
/**
 * Exports all churches of `country`, with their mass, confession and
 * adoration schedules, from the NAS database into `export-<country>.json`
 * in the current working directory.
 *
 * DATABASE_URL is pointed at the NAS for the lifetime of the client and is
 * always put back afterwards, including when it was unset to begin with.
 *
 * @param country Country code used to filter churches.
 * @returns Counts of the exported churches and schedules.
 * @throws Rethrows any connection, query or file error after logging it.
 */
async function exportFromNAS(country: string): Promise<ExportStats> {
  console.log(`📦 Exporting ${country} data from NAS...`);

  // Set DATABASE_URL to NAS.
  // NOTE(review): the fallback URL embeds credentials in source control;
  // NAS_DATABASE_URL lets callers override it without editing this file.
  const originalUrl = process.env.DATABASE_URL;
  process.env.DATABASE_URL =
    process.env.NAS_DATABASE_URL ?? 'postgresql://postgres:postgres@192.168.0.145:5432/nearestmass';

  try {
    const nasPrisma = new PrismaClient();

    try {
      await nasPrisma.$connect();
      console.log('✅ Connected to NAS database');

      // Export churches with all schedules
      const churches = await nasPrisma.churches.findMany({
        where: { country },
        include: {
          massSchedules: true,
          confessionSchedules: true,
          adorationSchedules: true,
        }
      });

      console.log(`Found ${churches.length} churches in ${country}`);

      // Count schedules
      const stats: ExportStats = {
        churches: churches.length,
        massSchedules: churches.reduce((sum, c) => sum + (c.massSchedules?.length || 0), 0),
        confessionSchedules: churches.reduce((sum, c) => sum + (c.confessionSchedules?.length || 0), 0),
        adorationSchedules: churches.reduce((sum, c) => sum + (c.adorationSchedules?.length || 0), 0),
      };

      // Save to file
      const exportFile = path.join(process.cwd(), `export-${country}.json`);
      await fs.writeFile(exportFile, JSON.stringify(churches, null, 2));
      console.log(`✅ Exported to ${exportFile}`);
      console.log(` - ${stats.churches} churches`);
      console.log(` - ${stats.massSchedules} mass schedules`);
      console.log(` - ${stats.confessionSchedules} confession schedules`);
      console.log(` - ${stats.adorationSchedules} adoration schedules`);

      return stats;
    } catch (error) {
      console.error('❌ Export failed:', error);
      throw error;
    } finally {
      await nasPrisma.$disconnect();
    }
  } finally {
    // Restore the caller's DATABASE_URL. When it was originally unset it must
    // be removed again; otherwise later clients in this process would keep
    // connecting to the NAS.
    if (originalUrl === undefined) {
      delete process.env.DATABASE_URL;
    } else {
      process.env.DATABASE_URL = originalUrl;
    }
  }
}
|
||||
|
||||
/**
 * Imports the churches stored in `export-<country>.json` (current working
 * directory) into the database named by DATABASE_URL.
 *
 * Each church is upserted on its latitude/longitude pair. For a church that
 * already existed, its mass, confession and adoration schedules are deleted
 * and replaced by the exported ones, so re-running the import does not
 * accumulate duplicates. A failure on one church is logged and counted, and
 * the import continues with the next church.
 *
 * @param country Country code; selects the export file to read.
 * @param dryRun When true (the default) nothing is written; churches are only counted.
 * @throws Error when the export file does not contain a JSON array; rethrows
 *   connection-level failures after logging them.
 */
async function importToNeon(country: string, dryRun: boolean = true): Promise<void> {
  console.log(`\n📤 Importing ${country} data to Neon...`);
  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No data will be written');
  }

  // Read export file
  const exportFile = path.join(process.cwd(), `export-${country}.json`);
  const data: unknown = JSON.parse(await fs.readFile(exportFile, 'utf-8'));
  if (!Array.isArray(data)) {
    throw new Error(`Expected ${exportFile} to contain a JSON array of churches`);
  }
  console.log(`Loaded ${data.length} churches from export file`);

  // Connect to Neon
  const neonPrisma = new PrismaClient();

  try {
    await neonPrisma.$connect();
    console.log('✅ Connected to Neon database');

    let inserted = 0;
    let updated = 0;
    let errors = 0;

    for (const church of data) {
      try {
        // Split off the relations and the source-database id without
        // mutating the parsed export data.
        const {
          id: _exportedId,
          massSchedules,
          confessionSchedules,
          adorationSchedules,
          ...churchData
        } = church;

        if (dryRun) {
          // Dry run - just count
          inserted++;
        } else {
          const coordinates = {
            latitude: churchData.latitude,
            longitude: churchData.longitude
          };

          // Look the church up BEFORE the upsert so insert vs. update is
          // known for certain instead of being guessed from createdAt.
          const existing = await neonPrisma.churches.findUnique({
            where: { latitude_longitude: coordinates },
            select: { id: true }
          });

          // Upsert church based on coordinates
          const saved = await neonPrisma.churches.upsert({
            where: { latitude_longitude: coordinates },
            create: churchData,
            update: churchData
          });

          if (existing) {
            updated++;
            // Replace, rather than append to, the schedules already stored.
            await neonPrisma.massSchedules.deleteMany({ where: { churchId: saved.id } });
            await neonPrisma.confessionSchedules.deleteMany({ where: { churchId: saved.id } });
            await neonPrisma.adorationSchedules.deleteMany({ where: { churchId: saved.id } });
          } else {
            inserted++;
          }

          // Insert schedules, dropping their source ids and re-pointing them
          // at the church row in this database.
          for (const schedule of massSchedules || []) {
            const { id: _scheduleId, ...scheduleData } = schedule;
            await neonPrisma.massSchedules.create({
              data: { ...scheduleData, churchId: saved.id }
            });
          }

          for (const schedule of confessionSchedules || []) {
            const { id: _scheduleId, ...scheduleData } = schedule;
            await neonPrisma.confessionSchedules.create({
              data: { ...scheduleData, churchId: saved.id }
            });
          }

          for (const schedule of adorationSchedules || []) {
            const { id: _scheduleId, ...scheduleData } = schedule;
            await neonPrisma.adorationSchedules.create({
              data: { ...scheduleData, churchId: saved.id }
            });
          }
        }

        // Report every 100 processed churches. Keying this on `inserted`
        // alone would log on every iteration while only updates happen.
        const processed = inserted + updated;
        if (processed % 100 === 0) {
          console.log(`Progress: ${processed} churches processed...`);
        }
      } catch (error) {
        errors++;
        console.error(`Error importing church ${church?.name}:`, error instanceof Error ? error.message : error);
      }
    }

    console.log('\n✅ Import complete!');
    console.log(` - ${inserted} churches inserted`);
    console.log(` - ${updated} churches updated`);
    console.log(` - ${errors} errors`);
  } catch (error) {
    console.error('❌ Import failed:', error);
    throw error;
  } finally {
    await neonPrisma.$disconnect();
  }
}
|
||||
|
||||
/**
 * CLI entry point: exports one country from the NAS, then imports it to Neon.
 *
 * Arguments: `[country] [mode]`. The country defaults to 'PL' and the mode to
 * 'dry-run'. Any failure is logged and ends the process with status 1.
 */
async function main() {
  const country = process.argv[2] || 'PL';
  const mode = process.argv[3] || 'dry-run';
  // NOTE(review): every mode other than the literal 'dry-run' performs a real
  // import, so a typo in the mode argument writes to Neon.
  const dryRun = mode === 'dry-run';

  console.log('🌍 Export/Import to Neon');
  console.log('========================\n');

  try {
    // Step 1: Export from NAS
    await exportFromNAS(country);

    // Step 2: Import to Neon
    await importToNeon(country, dryRun);

    if (dryRun) {
      console.log('\n💡 This was a DRY RUN. To actually import to Neon, run:');
      // This script lives under scripts/debug/, so the hint must say so.
      console.log(` npx tsx scripts/debug/export-import-to-neon.ts ${country} real-import`);
    } else {
      console.log('\n🎉 Data successfully uploaded to Neon!');
    }
  } catch (error) {
    console.error('❌ Process failed:', error);
    process.exit(1);
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
41
scripts/debug/find-donnerstag-sections.ts
Normal file
41
scripts/debug/find-donnerstag-sections.ts
Normal file
@@ -0,0 +1,41 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes alterpeter.de (country DE) and prints every occurrence of
 * "donnerstag" in the page text, with 100 characters of context before it and
 * 200 characters starting at it, followed by the total count.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();

  // Close the scraper even when scraping or the analysis below throws.
  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape('https://www.alterpeter.de/');
    if (!result.rawHtml) {
      return;
    }

    // Reduce the HTML to lower-cased plain text: drop scripts, styles and
    // tags, then collapse runs of whitespace.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Find all instances of "donnerstag" (Thursday)
    let count = 0;
    for (
      let idx = text.indexOf('donnerstag');
      idx !== -1;
      // Resume one character past the current hit.
      idx = text.indexOf('donnerstag', idx + 1)
    ) {
      count++;
      const contextBefore = text.substring(Math.max(0, idx - 100), idx);
      const contextAfter = text.substring(idx, Math.min(text.length, idx + 200));

      console.log(`=== Donnerstag occurrence ${count} at position ${idx} ===`);
      console.log(`BEFORE: "...${contextBefore}"`);
      console.log(`AFTER: "${contextAfter}..."`);
      console.log('');
    }

    console.log(`Total "donnerstag" occurrences: ${count}`);
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
42
scripts/debug/find-office-hours-pattern.ts
Normal file
42
scripts/debug/find-office-hours-pattern.ts
Normal file
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes alterpeter.de (country DE) and prints the text surrounding the
 * "9.00 – 12.00" range. When that exact text is absent, prints the context of
 * the first alternative spelling of nine o'clock that is found.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();

  // Close the scraper even when scraping or the analysis below throws.
  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape('https://www.alterpeter.de/');
    if (!result.rawHtml) {
      return;
    }

    // Reduce the HTML to lower-cased plain text: drop scripts, styles and
    // tags, then collapse runs of whitespace.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    const idx = text.indexOf('9.00 – 12.00');
    if (idx !== -1) {
      console.log('Context around "9.00 – 12.00":');
      console.log(text.substring(Math.max(0, idx - 150), idx + 200));
      return;
    }

    console.log('Pattern "9.00 – 12.00" not found');

    // Try alternative patterns; stop at the first one present in the text.
    const patterns = ['9.00', '9:00', '09:00', '09.00'];
    for (const pattern of patterns) {
      const idx2 = text.indexOf(pattern);
      if (idx2 !== -1) {
        console.log(`\nFound "${pattern}" at position ${idx2}:`);
        console.log(text.substring(Math.max(0, idx2 - 100), idx2 + 150));
        break;
      }
    }
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
102
scripts/debug/identify-top5-bugs.ts
Normal file
102
scripts/debug/identify-top5-bugs.ts
Normal file
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Identify which churches are flagged as "parsing bugs" in top 5 test
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/** Countries covered by the scan, as `[code, display name]` pairs. */
const COUNTRY_PAIRS: Array<[string, string]> = [
  ['FR', 'France'],
  ['DE', 'Germany'],
  ['ES', 'Spain'],
  ['PL', 'Poland'],
  ['BR', 'Brazil'],
];

/** The same countries as `{ code, name }` records, in scan order. */
const COUNTRIES = COUNTRY_PAIRS.map(([code, name]) => ({ code, name }));
|
||||
|
||||
/**
 * Scrapes up to ten OSM churches with a website for each country in
 * COUNTRIES and lists the pages where the scraper reported failure although
 * the page text contains both a day name and something that looks like a
 * time — i.e. likely parsing bugs rather than pages without a schedule.
 *
 * A church whose scrape throws is reported as a warning and skipped. The
 * scraper, the Prisma client and the pool are released even when the run fails.
 */
async function identifyBugs() {
  console.log('Identifying "parsing bugs" from top 5 test...\n');

  // Both patterns are loop-invariant, so build them once up front.
  const dayNamePattern = /\b(sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|domingos|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|sábados|sabados|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|segundas|terça|terca|terças|tercas|quarta|quartas|quinta|quintas|sexta|sextas)\b/i;

  // A time needs more than a bare number: either a separator plus two-digit
  // minutes ("10:30", "9.00", "8h30"), space-separated minutes ("8 00"), or
  // a unit suffix ("10h", "18 uhr", "3pm"). With every part after the hour
  // optional, any digit on the page would count as a time.
  const timePattern = /\b\d{1,2}(?:\s*[h:.]\s*\d{2}|\s+\d{2}(?!\d)|\s*(?:am|pm|h|uhr)\b)/i;

  const bugs: Array<{
    country: string;
    church: string;
    url: string;
    hasDays: boolean;
    hasTimes: boolean;
  }> = [];

  try {
    const scraper = new GenericScraper();
    await scraper.init();

    try {
      for (const country of COUNTRIES) {
        const churches = await prisma.church.findMany({
          where: {
            country: country.code,
            website: { not: null },
            source: 'osm',
          },
          take: 10,
          orderBy: { createdAt: 'asc' },
        });

        scraper.setCountry(country.code);

        for (const church of churches) {
          // The query excludes null websites; this guard also narrows the type.
          if (!church.website) {
            continue;
          }

          try {
            const result = await scraper.scrape(church.website);

            if (result.success || !result.rawHtml) {
              continue;
            }

            // Reduce the HTML to lower-cased plain text: drop scripts, styles
            // and tags, then collapse runs of whitespace.
            const text = result.rawHtml
              .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
              .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
              .replace(/<[^>]+>/g, ' ')
              .replace(/\s+/g, ' ')
              .toLowerCase();

            // Check for day names and times
            const hasDays = dayNamePattern.test(text);
            const hasTimes = timePattern.test(text);

            if (hasDays && hasTimes) {
              bugs.push({
                country: country.name,
                church: church.name,
                url: church.website,
                hasDays,
                hasTimes,
              });
            }
          } catch (err: unknown) {
            // Best effort: one unreachable site must not abort the scan, but
            // say which one was skipped instead of hiding the failure.
            const reason = err instanceof Error ? err.message : String(err);
            console.warn(`Skipping ${church.website}: ${reason}`);
          }
        }
      }
    } finally {
      await scraper.close();
    }

    console.log(`\n${'='.repeat(80)}`);
    console.log(`FOUND ${bugs.length} POTENTIAL PARSING BUGS\n`);

    bugs.forEach((bug, i) => {
      console.log(`${i + 1}. ${bug.church} (${bug.country})`);
      console.log(` URL: ${bug.url}`);
      console.log('');
    });
  } finally {
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
|
||||
|
||||
identifyBugs().catch(console.error);
|
||||
232
scripts/debug/import-to-neon.ts
Normal file
232
scripts/debug/import-to-neon.ts
Normal file
@@ -0,0 +1,232 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import churches from JSON export to Neon database
|
||||
* Run this LOCALLY (uses DATABASE_URL from .env pointing to Neon)
|
||||
*/
|
||||
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
|
||||
/**
 * One church record as read from an `export-<country>.json` file.
 *
 * Only the fields this script touches are spelled out; any other exported
 * column passes through the index signature untyped.
 */
interface ChurchExport {
  /** Identifier of the row in the database the export came from. */
  id: string;
  name: string;
  latitude: number;
  longitude: number;
  country: string;
  /** Related schedule rows; each list may be missing from a record. */
  massSchedules?: any[];
  confessionSchedules?: any[];
  adorationSchedules?: any[];
  /** Every remaining exported column. */
  [key: string]: any;
}
|
||||
|
||||
/**
 * Import one country's exported churches, and their schedules, into the
 * database that `DATABASE_URL` points at.
 *
 * CLI arguments (both optional):
 *   argv[2]  country code used to locate `export-<country>.json` in the
 *            current working directory (default `PL`)
 *   argv[3]  run mode; `dry-run` (the default) only counts records
 *
 * Churches are matched to existing rows by exact latitude/longitude. A match
 * is updated and its schedules are replaced; otherwise a new church is
 * created. A failure on a single church is logged and counted, then the
 * import continues. Any other failure is logged and the process exits with
 * status 1.
 */
async function main() {
  const country = process.argv[2] || 'PL';
  const mode = process.argv[3] || 'dry-run';
  // NOTE(review): every mode other than 'dry-run' performs a real import, so
  // a mistyped mode writes to the database. Consider requiring 'real-import'.
  const dryRun = mode === 'dry-run';

  console.log(`📤 Importing ${country} data to Neon...`);
  // Mask the password part of the URL before echoing it.
  console.log(`DATABASE_URL: ${process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@')}`);

  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No data will be written');
  }

  // Read export file
  const exportFile = path.join(process.cwd(), `export-${country}.json`);

  try {
    const data: ChurchExport[] = JSON.parse(await fs.readFile(exportFile, 'utf-8'));
    console.log(`Loaded ${data.length} churches from export file`);

    // Connect to Neon
    const prisma = new PrismaClient();

    let inserted = 0;
    let updated = 0;
    // Nothing is skipped at present; kept so the summary format is unchanged.
    const skipped = 0;
    let errors = 0;
    const scheduleTotals = { mass: 0, confession: 0, adoration: 0 };

    type RelatedSchedules = { mass: any[]; confession: any[]; adoration: any[] };

    // Create every schedule row for one church, dropping the exported ids so
    // the target database assigns its own. Totals count rows actually created.
    const insertSchedules = async (churchId: unknown, related: RelatedSchedules): Promise<void> => {
      for (const { id: _exportedId, ...fields } of related.mass) {
        await prisma.massSchedules.create({ data: { ...fields, churchId } });
        scheduleTotals.mass++;
      }
      for (const { id: _exportedId, ...fields } of related.confession) {
        await prisma.confessionSchedules.create({ data: { ...fields, churchId } });
        scheduleTotals.confession++;
      }
      for (const { id: _exportedId, ...fields } of related.adoration) {
        await prisma.adorationSchedules.create({ data: { ...fields, churchId } });
        scheduleTotals.adoration++;
      }
    };

    try {
      await prisma.$connect();
      console.log('✅ Connected to Neon database');

      for (const church of data) {
        try {
          // Split relations and the exported id away from the plain columns
          // without mutating the parsed record.
          const {
            id: _exportedId,
            massSchedules,
            confessionSchedules,
            adorationSchedules,
            ...churchData
          } = church;
          const related: RelatedSchedules = {
            mass: massSchedules || [],
            confession: confessionSchedules || [],
            adoration: adorationSchedules || [],
          };

          if (dryRun) {
            // Dry run - just count
            inserted++;
            scheduleTotals.mass += related.mass.length;
            scheduleTotals.confession += related.confession.length;
            scheduleTotals.adoration += related.adoration.length;
          } else {
            // Check if church already exists
            const existing = await prisma.churches.findFirst({
              where: {
                latitude: church.latitude,
                longitude: church.longitude,
              },
            });

            if (existing) {
              await prisma.churches.update({
                where: { id: existing.id },
                data: churchData,
              });

              // Replace the schedules wholesale: delete, then re-insert.
              await prisma.massSchedules.deleteMany({ where: { churchId: existing.id } });
              await prisma.confessionSchedules.deleteMany({ where: { churchId: existing.id } });
              await prisma.adorationSchedules.deleteMany({ where: { churchId: existing.id } });

              await insertSchedules(existing.id, related);
              updated++;
            } else {
              const created = await prisma.churches.create({ data: churchData });
              await insertSchedules(created.id, related);
              inserted++;
            }
          }

          if ((inserted + updated) % 100 === 0) {
            console.log(`Progress: ${inserted + updated} churches processed...`);
          }
        } catch (error) {
          errors++;
          console.error(`Error importing church ${church.name}:`, error instanceof Error ? error.message : error);
        }
      }

      console.log('\n✅ Import complete!');
      console.log(` - ${inserted} churches inserted`);
      console.log(` - ${updated} churches updated`);
      console.log(` - ${skipped} churches skipped`);
      console.log(` - ${errors} errors`);
      console.log(` - ${scheduleTotals.mass} mass schedules`);
      console.log(` - ${scheduleTotals.confession} confession schedules`);
      console.log(` - ${scheduleTotals.adoration} adoration schedules`);

      if (dryRun) {
        console.log('\n💡 This was a DRY RUN. To actually import to Neon, run:');
        console.log(` npx tsx scripts/debug/import-to-neon.ts ${country} real-import`);
      } else {
        console.log('\n🎉 Data successfully uploaded to Neon!');
      }
    } catch (error) {
      console.error('❌ Import failed:', error);
      throw error;
    } finally {
      // Runs on success and on failure, before the outer handler exits.
      await prisma.$disconnect();
    }
  } catch (error) {
    if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {
      console.error(`❌ Export file not found: ${exportFile}`);
      console.error(`\nFirst, export data from NAS:`);
      console.error(` ssh albert@192.168.0.145`);
      console.error(` cd /volume1/docker/nearestmass`);
      console.error(` /usr/local/bin/docker compose --profile tools run --rm scraper npx tsx scripts/export-from-nas.ts ${country}`);
      console.error(`\nThen download the export:`);
      console.error(` scp albert@192.168.0.145:/volume1/docker/nearestmass/export-${country}.json .`);
      console.error(`\nFinally, run this import script again.`);
    } else {
      console.error('❌ Process failed:', error);
    }
    process.exit(1);
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
84
scripts/debug/investigate-8-bugs.ts
Normal file
84
scripts/debug/investigate-8-bugs.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Investigate the 8 potential parsing bugs
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/** The eight church sites whose scrape results are being investigated. */
const BUGS = [
  {
    name: 'Chapelle Saint-Jean-XXIII',
    country: 'FR',
    url: 'https://www.chemin-neuf.fr/',
  },
  {
    name: 'St. Marien',
    country: 'DE',
    url: 'https://www.willehad.de/start/',
  },
  {
    name: 'Iglesia de San Fernando',
    country: 'ES',
    url: 'https://www.parroquiasanfernandomaspalomas.net/de/',
  },
  {
    name: 'Monestir de Sant Esperit',
    country: 'ES',
    url: 'https://www.santoespiritu.org/',
  },
  {
    name: 'Santuario de Manalagua',
    country: 'ES',
    url: 'http://tierrasdeburgos.blogspot.com.es/2013/12/escultura-del-agua-santuario-de.html',
  },
  {
    name: 'Kościół pw. Najświętszego Serca',
    country: 'PL',
    url: 'http://parafialubojna.pl',
  },
  {
    name: 'Paróquia do Desterro',
    country: 'BR',
    url: 'https://paroquiaportodegalinhas.blogspot.com.br/',
  },
  {
    name: 'Catedral Diocesana',
    country: 'BR',
    url: 'http://diocesedejuazeiro.org.br/',
  },
];
|
||||
|
||||
/**
 * Scrape each entry of `BUGS` and print what the scraper returned.
 *
 * For an unsuccessful scrape that still carries raw HTML, the markup is
 * reduced to lower-cased text and a few heuristics are printed: page-type
 * warnings, whether any schedule keyword occurs, and a text sample taken
 * around the first mass-related keyword.
 */
async function investigate() {
  console.log('Investigating 8 potential bugs...\n');

  // Keywords tried, in priority order, to position the text sample.
  const sampleKeywords = ['mass', 'messe', 'misa', 'missa'];
  const scheduleKeywords = /(mass|messe|misa|missa|horário|horario|gottesdienst|eucarist)/i;

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    for (const [i, bug] of BUGS.entries()) {
      console.log(`${'='.repeat(80)}`);
      console.log(`${i + 1}. ${bug.name} (${bug.country})`);
      console.log(` ${bug.url}`);
      console.log('='.repeat(80));

      scraper.setCountry(bug.country);

      try {
        const result = await scraper.scrape(bug.url);

        console.log(`Success: ${result.success}`);
        console.log(`Schedules: ${result.schedules.length}`);
        console.log(`Error: ${result.error || 'none'}`);

        if (!result.success && result.rawHtml) {
          // Strip scripts, styles and tags, then collapse whitespace.
          const text = result.rawHtml
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<[^>]+>/g, ' ')
            .replace(/\s+/g, ' ')
            .toLowerCase();

          // Check page type
          console.log('\nPage analysis:');
          if (text.includes('blogspot')) {
            console.log(' ⚠️ Blogspot page (likely blog post, not church website)');
          }
          if (text.includes('hotel') || text.includes('reservation') || text.includes('booking')) {
            console.log(' ⚠️ Contains hotel/booking keywords');
          }
          if (text.includes('restaurant') || text.includes('menu')) {
            console.log(' ⚠️ Contains restaurant keywords');
          }
          if (text.includes('404') || text.includes('not found') || text.includes('error')) {
            console.log(' ⚠️ Error/404 page');
          }

          // Check if it has schedule keywords
          const hasScheduleKeywords = scheduleKeywords.test(text);
          console.log(` Schedule keywords: ${hasScheduleKeywords ? '✓ Found' : '✗ Not found'}`);

          // Show sample text. indexOf returns -1 for "not found", which is
          // truthy, so an `||` chain cannot express this fallback; take the
          // first keyword that actually occurs and default to the start.
          const massIndex =
            sampleKeywords.map(keyword => text.indexOf(keyword)).find(index => index >= 0) ?? 0;
          const sampleStart = Math.max(0, massIndex - 50);
          const sample = text.substring(sampleStart, sampleStart + 300);
          console.log(`\n Sample text: "${sample.substring(0, 200)}..."`);
        }

        console.log('\n');
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        console.log(`ERROR: ${message}\n\n`);
      }
    }
  } finally {
    // Close the scraper even when the loop fails unexpectedly.
    await scraper.close();
  }
}
|
||||
|
||||
investigate().catch(console.error);
|
||||
134
scripts/debug/list-church-websites.ts
Normal file
134
scripts/debug/list-church-websites.ts
Normal file
@@ -0,0 +1,134 @@
|
||||
import { config } from 'dotenv';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
|
||||
// Environment files are read in this order: .env.local, then .env.
for (const envFile of ['.env.local', '.env']) {
  config({ path: envFile });
}

// Fail fast when no connection string is configured.
const { DATABASE_URL: connectionString } = process.env;
if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}

// Prisma client running on top of a pg connection pool.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Write `church-websites.md`: a Markdown report of all churches with summary
 * counts, a per-country breakdown, every church that has a website, and every
 * church that does not.
 *
 * On failure the error is logged and the exit code is set to 1. The Prisma
 * client and the pool are closed in every case.
 */
async function listChurchWebsites() {
  // One-decimal percentage; an empty denominator yields "0.0", not "NaN".
  const percent = (part: number, whole: number): string =>
    (whole === 0 ? 0 : (part / whole) * 100).toFixed(1);

  try {
    console.log('Fetching churches from database...\n');

    const churches = await prisma.church.findMany({
      select: {
        id: true,
        name: true,
        city: true,
        state: true,
        country: true,
        website: true,
        googlePlaceId: true,
      },
      orderBy: [
        { country: 'asc' },
        { state: 'asc' },
        { city: 'asc' },
      ],
    });

    console.log(`Total churches: ${churches.length}`);

    const withWebsite = churches.filter(c => c.website);
    const withGoogle = churches.filter(c => c.googlePlaceId);
    const withoutWebsite = churches.filter(c => !c.website);

    console.log(`Churches with website: ${withWebsite.length}`);
    console.log(`Churches with Google Place ID: ${withGoogle.length}`);
    console.log(`Churches without website: ${withoutWebsite.length}\n`);

    // Group by country
    const byCountry: Record<string, typeof churches> = {};
    for (const church of churches) {
      const country = church.country || 'Unknown';
      if (!byCountry[country]) {
        byCountry[country] = [];
      }
      byCountry[country].push(church);
    }
    const countryGroups = Object.entries(byCountry);
    const bySize = [...countryGroups].sort(([, a], [, b]) => b.length - a.length);
    const byName = [...countryGroups].sort(([a], [b]) => a.localeCompare(b));

    // Build the report
    let output = '# Church Websites\n\n';
    output += `Generated: ${new Date().toISOString()}\n\n`;
    output += `## Summary\n`;
    output += `- Total churches: ${churches.length}\n`;
    output += `- With website: ${withWebsite.length} (${percent(withWebsite.length, churches.length)}%)\n`;
    output += `- With Google Place ID: ${withGoogle.length} (${percent(withGoogle.length, churches.length)}%)\n`;
    output += `- Without website: ${withoutWebsite.length} (${percent(withoutWebsite.length, churches.length)}%)\n\n`;

    // Add country breakdown, largest country first
    output += `## By Country\n\n`;
    for (const [country, countryChurches] of bySize) {
      const siteCount = countryChurches.filter(c => c.website).length;
      const googleCount = countryChurches.filter(c => c.googlePlaceId).length;
      output += `### ${country} (${countryChurches.length} churches)\n`;
      output += `- With website: ${siteCount} (${percent(siteCount, countryChurches.length)}%)\n`;
      output += `- With Google Place ID: ${googleCount} (${percent(googleCount, countryChurches.length)}%)\n\n`;
    }

    // List all websites, countries in alphabetical order
    output += `## All Websites\n\n`;
    for (const [country, countryChurches] of byName) {
      output += `### ${country}\n\n`;
      for (const church of countryChurches) {
        if (!church.website) continue;
        const location = [church.city, church.state, church.country].filter(Boolean).join(', ');
        output += `- **${church.name}** (${location})\n`;
        output += ` - Website: ${church.website}\n`;
        if (church.googlePlaceId) {
          output += ` - Google Place ID: ${church.googlePlaceId}\n`;
        }
        output += ` - DB ID: ${church.id}\n\n`;
      }
    }

    // List churches without websites
    output += `## Churches Without Websites\n\n`;
    for (const [country, countryChurches] of byName) {
      const without = countryChurches.filter(c => !c.website);
      if (without.length === 0) continue;
      output += `### ${country}\n\n`;
      for (const church of without) {
        const location = [church.city, church.state, church.country].filter(Boolean).join(', ');
        output += `- **${church.name}** (${location})\n`;
        if (church.googlePlaceId) {
          output += ` - Google Place ID: ${church.googlePlaceId}\n`;
        }
        output += ` - DB ID: ${church.id}\n\n`;
      }
    }

    // Write to file
    const fs = await import('fs/promises');
    await fs.writeFile('church-websites.md', output);
    console.log('✓ Written to church-websites.md');
  } catch (error) {
    console.error('Error:', error);
    // Set the exit code rather than exiting here, so the cleanup below runs.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
listChurchWebsites();
|
||||
44
scripts/debug/list-tables.ts
Normal file
44
scripts/debug/list-tables.ts
Normal file
@@ -0,0 +1,44 @@
|
||||
import { Pool } from 'pg';
|
||||
import * as dotenv from 'dotenv';
|
||||
import * as path from 'path';
|
||||
|
||||
// Environment files, resolved against the working directory and loaded in
// this order: .env.local, then .env.
for (const envFile of ['.env.local', '.env']) {
  dotenv.config({ path: path.resolve(process.cwd(), envFile) });
}

// Shared connection pool for the queries below.
const connectionString = process.env.DATABASE_URL;
const pool = new Pool({ connectionString });
|
||||
|
||||
/**
 * Print the name of every table in the `public` schema, followed by a count.
 *
 * A failure is logged and the exit code is set to 1, so a caller can tell the
 * listing did not succeed. The pool is closed in every case.
 */
async function listTables() {
  try {
    console.log('Connecting to database...');
    // Mask the password part of the URL before echoing it.
    console.log('DATABASE_URL:', process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@'));

    // List all tables
    const result = await pool.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
      ORDER BY table_name;
    `);

    console.log('\n=== Tables in Database ===');
    if (result.rows.length === 0) {
      console.log('No tables found!');
    } else {
      for (const row of result.rows) {
        console.log(`- ${row.table_name}`);
      }
    }

    console.log(`\nTotal: ${result.rows.length} tables`);
  } catch (error) {
    console.error('Error listing tables:', error);
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
listTables();
|
||||
167
scripts/debug/pipeline-report.js
Normal file
167
scripts/debug/pipeline-report.js
Normal file
@@ -0,0 +1,167 @@
|
||||
const { Client } = require("pg");
|
||||
// Connection used by every report query. DATABASE_URL, when set and
// non-empty, takes precedence so credentials need not live in the source;
// the literal below is kept as the fallback for existing usage.
const client = new Client({
  connectionString:
    process.env.DATABASE_URL ||
    "postgresql://postgres:postgres@192.168.0.145:5434/nearestmass"
});
|
||||
|
||||
// Report sections, run in order. Each entry has a display title (`name`) and
// the SQL whose rows are printed as a table beneath it.
const queries = [
  // Per-country totals with website / scrape / language coverage.
  {
    name: "1. Overall church counts by country (top 20)",
    sql: `SELECT country, COUNT(*) as total,
      COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
      COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped,
      COUNT(*) FILTER (WHERE has_website = true) as has_website_flag,
      COUNT(*) FILTER (WHERE website_language IS NOT NULL) as has_language
      FROM churches
      GROUP BY country
      ORDER BY total DESC
      LIMIT 20`
  },
  // Schedule rows and the number of distinct churches they belong to.
  {
    name: "2. Total mass schedule counts",
    sql: `SELECT COUNT(*) as total_schedules,
      COUNT(DISTINCT church_id) as churches_with_schedules
      FROM mass_schedules`
  },
  // Churches per detected website language, and how many have a scrape date.
  {
    name: "3. Scrape results by language",
    sql: `SELECT website_language as language,
      COUNT(*) as total_scraped,
      COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped
      FROM churches
      WHERE website_language IS NOT NULL
      GROUP BY website_language
      ORDER BY total_scraped DESC`
  },
  // Websites known but no scrape recorded.
  {
    name: "4. Churches with websites but never scraped",
    sql: `SELECT COUNT(*) as has_website_not_scraped
      FROM churches
      WHERE website IS NOT NULL AND last_scraped_at IS NULL`
  },
  // Single-row funnel from all churches down to saved schedules.
  {
    name: "5. Overall pipeline funnel",
    sql: `SELECT
      COUNT(*) as total_churches,
      COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
      COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as attempted_scrape,
      COUNT(*) FILTER (WHERE website_language IS NOT NULL) as has_detected_language,
      (SELECT COUNT(DISTINCT church_id) FROM mass_schedules) as has_schedules_saved,
      (SELECT COUNT(*) FROM mass_schedules) as total_schedule_rows
      FROM churches`
  },
  // Churches with a scrape date in the last seven days, per language.
  {
    name: "6. Recent scrape activity (last 7 days) by language",
    sql: `SELECT website_language as language,
      COUNT(*) as scraped_last_7d
      FROM churches
      WHERE last_scraped_at > NOW() - INTERVAL '7 days'
      GROUP BY website_language
      ORDER BY scraped_last_7d DESC`
  },
  // Most recent finished jobs with their duration in hours.
  {
    name: "7. Background job history (last 15 completed/failed jobs)",
    sql: `SELECT type, language, status,
      created_at::date as created,
      completed_at::date as completed,
      ROUND(CAST(EXTRACT(EPOCH FROM (completed_at - created_at))/3600 AS numeric), 2) as hours,
      total_items, processed, succeeded, failed
      FROM background_jobs
      WHERE status IN ('completed', 'failed')
      ORDER BY completed_at DESC
      LIMIT 15`
  },
  // Schedule rows per day_of_week value (0 is labelled Sunday).
  {
    name: "8. Mass schedule breakdown by day of week",
    sql: `SELECT day_of_week,
      CASE day_of_week
      WHEN 0 THEN 'Sunday' WHEN 1 THEN 'Monday' WHEN 2 THEN 'Tuesday'
      WHEN 3 THEN 'Wednesday' WHEN 4 THEN 'Thursday' WHEN 5 THEN 'Friday'
      WHEN 6 THEN 'Saturday' ELSE 'Other'
      END as day_name,
      COUNT(*) as count
      FROM mass_schedules
      GROUP BY day_of_week
      ORDER BY day_of_week`
  },
  // Schedule coverage per country; NULLIF guards the percentage division.
  {
    name: "9. Churches with schedules by country (top 15)",
    sql: `SELECT c.country,
      COUNT(DISTINCT c.id) as total_churches,
      COUNT(DISTINCT ms.church_id) as churches_with_schedules,
      ROUND(100.0 * COUNT(DISTINCT ms.church_id) / NULLIF(COUNT(DISTINCT c.id), 0), 1) as coverage_pct,
      COUNT(ms.id) as total_schedule_rows
      FROM churches c
      LEFT JOIN mass_schedules ms ON ms.church_id = c.id
      GROUP BY c.country
      ORDER BY total_churches DESC
      LIMIT 15`
  },
  // Church count per value of the source column.
  {
    name: "10. Enrichment sources - how churches were found",
    sql: `SELECT source, COUNT(*) as count
      FROM churches
      GROUP BY source
      ORDER BY count DESC`
  },
  // Counts around Google Place ids and the two search timestamps.
  {
    name: "11. Google Places enrichment impact",
    sql: `SELECT
      COUNT(*) FILTER (WHERE google_place_id IS NOT NULL) as has_google_place,
      COUNT(*) FILTER (WHERE google_place_id IS NOT NULL AND website IS NOT NULL) as google_with_website,
      COUNT(*) FILTER (WHERE google_place_id IS NULL) as no_google_place,
      COUNT(*) FILTER (WHERE google_searched_at IS NOT NULL) as google_searched,
      COUNT(*) FILTER (WHERE free_searched_at IS NOT NULL) as free_searched
      FROM churches`
  },
  // Website, Google Place and scrape coverage per source.
  {
    name: "12. Website presence by source",
    sql: `SELECT source,
      COUNT(*) as total,
      COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
      ROUND(100.0 * COUNT(*) FILTER (WHERE website IS NOT NULL) / NULLIF(COUNT(*), 0), 1) as website_pct,
      COUNT(*) FILTER (WHERE google_place_id IS NOT NULL) as has_google_place,
      COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped
      FROM churches
      GROUP BY source
      ORDER BY total DESC`
  }
];
|
||||
|
||||
/**
 * Connect, run every entry of `queries` in order and print each result set as
 * a fixed-width text table under its title, then disconnect.
 *
 * A failing query prints its error message and the report continues with the
 * next one.
 */
async function run() {
  await client.connect();

  const rule = "=".repeat(90);
  // Text shown for one cell; null and undefined are rendered as "NULL".
  const cellText = (row, column) => String(row[column] ?? "NULL");

  for (const { name, sql } of queries) {
    console.log(rule);
    console.log(name);
    console.log(rule);

    try {
      const { rows } = await client.query(sql);

      if (rows.length > 0) {
        const columns = Object.keys(rows[0]);

        // Each column is as wide as its header or its longest cell.
        const widths = columns.map(column =>
          rows.reduce(
            (widest, row) => Math.max(widest, cellText(row, column).length),
            column.length
          )
        );

        // Header and separator
        const header = columns.map((column, i) => column.padEnd(widths[i]));
        console.log(header.join(" | "));
        console.log(widths.map(width => "-".repeat(width)).join("-+-"));

        // Data rows
        rows.forEach(row => {
          const cells = columns.map((column, i) => cellText(row, column).padEnd(widths[i]));
          console.log(cells.join(" | "));
        });
      } else {
        console.log("(no rows returned)");
      }

      console.log("\n(" + rows.length + " rows)\n");
    } catch (err) {
      console.log("ERROR:", err.message, "\n");
    }
  }

  await client.end();
}
|
||||
|
||||
run().catch(e => { console.error(e); process.exit(1); });
|
||||
48
scripts/debug/show-french-success.ts
Normal file
48
scripts/debug/show-french-success.ts
Normal file
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Show detailed output from a successful French parse
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrape one known-good French page and print the parsed schedules grouped by
 * day of week, using French day names (index 0 is Dimanche).
 */
async function showSuccess() {
  // One of our successful churches with 16 schedules
  const url = 'https://laportelatine.org/lieux/couvent-saint-francois-morgon';
  console.log(`Detailed parse of: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('FR');

    const result = await scraper.scrape(url);

    console.log(`✅ Success: ${result.success}`);
    console.log(`📅 Schedules found: ${result.schedules.length}\n`);

    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    const dayNames = ['Dimanche', 'Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi'];

    console.log('═══════════════════════════════════════════════');
    console.log('PARSED SCHEDULE:');
    console.log('═══════════════════════════════════════════════\n');

    // Object keys are strings, so compare them numerically.
    const days = Object.entries(byDay).sort(([a], [b]) => parseInt(a) - parseInt(b));
    for (const [day, scheds] of days) {
      console.log(`${dayNames[parseInt(day)]}:`);
      for (const s of scheds) {
        console.log(` ${s.time} - ${s.language} ${s.massType}`);
      }
      console.log('');
    }
  } finally {
    // Close the scraper even when scraping or printing throws.
    await scraper.close();
  }
}
|
||||
|
||||
showSuccess().catch(console.error);
|
||||
28
scripts/debug/test-db-connection.ts
Normal file
28
scripts/debug/test-db-connection.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test database connection
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
// Report whether a connection string is configured. The value is printed with
// the password masked, in the same way as the other debug scripts, instead of
// echoing its first characters verbatim.
console.log('DATABASE_URL exists:', !!process.env.DATABASE_URL);
console.log('DATABASE_URL value:', process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@'));
|
||||
|
||||
import { prisma } from '../../src/lib/db';
|
||||
|
||||
/**
 * Check database connectivity by counting the rows of the church model.
 *
 * Prints the count on success. On failure the error message is printed and
 * the exit code is set to 1, so a calling script can detect the failed check.
 * The client is disconnected in every case.
 */
async function testConnection() {
  try {
    const count = await prisma.church.count();
    console.log(`✅ Database connection successful!`);
    console.log(`Total churches in database: ${count}`);
  } catch (err: unknown) {
    console.log(`❌ Database connection failed:`);
    console.log(err instanceof Error ? err.message : String(err));
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
  }
}
|
||||
|
||||
testConnection();
|
||||
180
scripts/debug/test-french-broader.ts
Normal file
180
scripts/debug/test-french-broader.ts
Normal file
@@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test more French churches and collect diagnostic data
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
// Prisma client running on top of a pg pool built from DATABASE_URL.
const connectionString = process.env.DATABASE_URL;
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/** What was observed while scraping one church website. */
interface DiagnosticInfo {
  churchName: string;
  url: string;

  /** Outcome reported by the scraper. */
  success: boolean;
  schedulesFound: number;
  error?: string;

  /** Whether the page text contains a French day name. */
  hasFrenchDays: boolean;
  /** Whether the page text contains anything that looks like a time. */
  hasTimePatterns: boolean;
  /** Distinct time-like matches found on the page (at most ten are kept). */
  timePatternsSample: string[];
  /** Leading part of the extracted page text. */
  textSample: string;
}
|
||||
|
||||
/**
 * Scrape up to 20 French OSM churches that have a website and classify every
 * failure by what the page text contains: French day names, time-like
 * patterns, both, or neither.
 *
 * A page that has both days and times but still fails is reported as a likely
 * parsing bug, with sample times and text. The scraper, the Prisma client and
 * the pool are released even when the run fails part-way.
 */
async function testFrenchBroader() {
  console.log('Testing 20 French churches with diagnostics...\n');

  // Hoisted out of the loop: these do not change per church.
  const frenchDays = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'];
  const timeRegex = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h)?/g;

  try {
    // Get more French churches
    const churches = await prisma.church.findMany({
      where: {
        country: 'FR',
        website: { not: null },
        source: 'osm',
      },
      take: 20,
      orderBy: { createdAt: 'asc' },
    });

    if (churches.length === 0) {
      console.log('No French churches found.');
      return;
    }

    console.log(`Found ${churches.length} French churches to test\n`);

    const scraper = new GenericScraper();
    await scraper.init();

    let successCount = 0;
    const diagnostics: DiagnosticInfo[] = [];

    try {
      scraper.setCountry('FR');

      for (const [i, church] of churches.entries()) {
        // The query already excludes null websites; this narrows the type.
        const url = church.website;
        if (!url) continue;

        console.log(`[${i + 1}/${churches.length}] Testing: ${church.name} (${church.city || 'Unknown'})`);
        console.log(`URL: ${url}`);

        try {
          const result = await scraper.scrape(url);

          // Extract diagnostics
          let hasFrenchDays = false;
          let hasTimePatterns = false;
          let timePatternsSample: string[] = [];
          let textSample = '';

          if (result.rawHtml) {
            // Strip scripts, styles and tags, then collapse whitespace.
            const text = result.rawHtml
              .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
              .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
              .replace(/<[^>]+>/g, ' ')
              .replace(/\s+/g, ' ')
              .toLowerCase();

            textSample = text.substring(0, 500);
            hasFrenchDays = frenchDays.some(day => text.includes(day));

            const times = text.match(timeRegex);
            if (times) {
              hasTimePatterns = true;
              timePatternsSample = [...new Set(times)].slice(0, 10);
            }
          }

          diagnostics.push({
            url,
            churchName: church.name,
            success: result.success,
            schedulesFound: result.schedules.length,
            hasFrenchDays,
            hasTimePatterns,
            timePatternsSample,
            textSample,
            error: result.error,
          });

          if (result.success && result.schedules.length > 0) {
            successCount++;
            console.log(`✅ SUCCESS - ${result.schedules.length} schedules`);
          } else {
            console.log(`❌ FAILED - ${result.error}`);
            if (hasFrenchDays && !hasTimePatterns) {
              console.log(` 💡 Has French days but no times`);
            } else if (!hasFrenchDays && hasTimePatterns) {
              console.log(` 💡 Has times but no French days`);
            } else if (hasFrenchDays && hasTimePatterns) {
              console.log(` 💡 Has BOTH days and times - parsing issue!`);
              console.log(` Sample times: ${timePatternsSample.slice(0, 5).join(', ')}`);
            } else {
              console.log(` 💡 No mass schedule content found`);
            }
          }
          console.log('');
        } catch (err: unknown) {
          const message = err instanceof Error ? err.message : String(err);
          console.log(`❌ ERROR - ${message}\n`);
          diagnostics.push({
            url,
            churchName: church.name,
            success: false,
            schedulesFound: 0,
            hasFrenchDays: false,
            hasTimePatterns: false,
            timePatternsSample: [],
            textSample: '',
            error: message,
          });
        }
      }
    } finally {
      await scraper.close();
    }

    // Analysis
    console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log(`\nRESULTS: ${successCount}/${churches.length} successful (${((successCount / churches.length) * 100).toFixed(0)}%)`);
    console.log('');

    const failed = diagnostics.filter(d => !d.success);
    const hasBoth = failed.filter(d => d.hasFrenchDays && d.hasTimePatterns);
    const hasDaysNoTimes = failed.filter(d => d.hasFrenchDays && !d.hasTimePatterns);
    const hasTimesNoDays = failed.filter(d => !d.hasFrenchDays && d.hasTimePatterns);
    const hasNeither = failed.filter(d => !d.hasFrenchDays && !d.hasTimePatterns);

    console.log('FAILURE ANALYSIS:');
    console.log(` Has days + times but failed: ${hasBoth.length} (PARSING BUG)`);
    console.log(` Has days but no times: ${hasDaysNoTimes.length}`);
    console.log(` Has times but no days: ${hasTimesNoDays.length}`);
    console.log(` Has neither: ${hasNeither.length} (no mass schedule on page)`);
    console.log('');

    if (hasBoth.length > 0) {
      console.log('⚠️ PARSING BUGS TO FIX (has both days and times but failed):');
      for (const d of hasBoth) {
        console.log(` ${d.churchName}`);
        console.log(` URL: ${d.url}`);
        console.log(` Sample times found: ${d.timePatternsSample.slice(0, 5).join(', ')}`);
        console.log(` Text sample: ${d.textSample.substring(0, 150)}...`);
        console.log('');
      }
    }
  } finally {
    // Runs on the early return, on success and on failure.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
testFrenchBroader().catch(console.error);
|
||||
100
scripts/debug/test-french-scraper.ts
Executable file
100
scripts/debug/test-french-scraper.ts
Executable file
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test international scraper against French churches
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
// Fail fast with a clear message when the connection string is missing,
// rather than constructing the pool with an undefined connection string.
const connectionString = process.env.DATABASE_URL;

if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}

// Prisma reaches Postgres through the `pg` driver adapter, so the pool is
// created first and then handed to the client.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Scrapes up to five OSM-sourced French churches that have a website (oldest
 * `createdAt` first), prints the schedules parsed for each one grouped by day,
 * and finishes with a success/failure summary.
 *
 * The scraper and the database handles are released in `finally` blocks so a
 * failure part-way through does not leave them open.
 */
async function testFrenchScraper() {
  console.log('Testing French church mass schedule scraping...\n');

  try {
    // Get French churches with websites
    const churches = await prisma.church.findMany({
      where: {
        country: 'FR',
        website: { not: null },
        source: 'osm',
      },
      take: 5,
      orderBy: { createdAt: 'asc' },
    });

    if (churches.length === 0) {
      console.log('No French churches with websites found.');
      return;
    }

    console.log(`Found ${churches.length} French churches to test:\n`);

    // Indexed by `dayOfWeek`, where 0 is Sunday.
    const dayNames = ['Dimanche', 'Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi'];

    let successCount = 0;
    let failCount = 0;

    const scraper = new GenericScraper();
    await scraper.init();

    try {
      scraper.setCountry('FR');

      for (const church of churches) {
        console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
        console.log(`Church: ${church.name}`);
        console.log(`City: ${church.city || 'Unknown'}`);
        console.log(`URL: ${church.website}`);
        console.log('');

        try {
          // The query above filters on `website: { not: null }`.
          const result = await scraper.scrape(church.website!);

          if (result.success && result.schedules.length > 0) {
            successCount++;
            console.log(`✅ SUCCESS - Found ${result.schedules.length} schedules\n`);

            // Group by day and show
            const byDay: Record<number, typeof result.schedules> = {};
            for (const sched of result.schedules) {
              if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
              byDay[sched.dayOfWeek].push(sched);
            }

            for (const [day, scheds] of Object.entries(byDay)) {
              console.log(` ${dayNames[parseInt(day, 10)]}:`);
              for (const s of scheds) {
                console.log(` ${s.time} - ${s.language || 'Unknown'} (${s.massType || 'Mass'})`);
              }
            }
            console.log('');
          } else {
            failCount++;
            // A "successful" scrape with zero schedules carries no error text,
            // so fall back to a readable reason instead of printing `undefined`.
            console.log(`❌ FAILED - ${result.error ?? 'No schedules found'}`);
            console.log('');
          }
        } catch (err: unknown) {
          failCount++;
          console.log(`❌ ERROR - ${err instanceof Error ? err.message : String(err)}`);
          console.log('');
        }
      }
    } finally {
      await scraper.close();
    }

    console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    // `churches.length` is non-zero here because of the early return above.
    console.log(`\nRESULTS: ${successCount}/${churches.length} successful (${((successCount / churches.length) * 100).toFixed(0)}%)`);
    console.log(`Success: ${successCount}, Failed: ${failCount}\n`);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

testFrenchScraper().catch(console.error);
|
||||
210
scripts/debug/test-international-sample.ts
Normal file
210
scripts/debug/test-international-sample.ts
Normal file
@@ -0,0 +1,210 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test scraper on a diverse sample of international churches
|
||||
* to identify edge cases across different languages and formats
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/** One hand-picked church website used to exercise the scraper. */
interface TestChurch {
  /** Display name printed in the report header. */
  name: string;
  /** Page handed to the scraper. */
  url: string;
  /** Country code for this sample. */
  country: string;
  /** Human-readable language label for this sample. */
  language: string;
  /** e.g., "Sun-Sat" or "Sun, Wed, Sat" */
  expectedDays?: string;
  /** Free-form remark about why this site is in the sample. */
  notes?: string;
}

/**
 * Expands a list of sites that share a country and language into full
 * `TestChurch` records, so the shared fields are written once per group.
 */
function churchesIn(
  country: string,
  language: string,
  sites: ReadonlyArray<Pick<TestChurch, 'name' | 'url' | 'notes'>>,
): TestChurch[] {
  return sites.map(({ name, url, notes }) => ({ name, url, country, language, notes }));
}

// Sample churches from different countries/languages
const testChurches: TestChurch[] = [
  ...churchesIn('FR', 'French', [
    {
      name: 'Saint-Étienne du Mont, Paris',
      url: 'https://www.saintetiennedumontparis.fr/',
      notes: 'French format with "du lundi au vendredi"',
    },
    {
      name: 'Notre-Dame de la Garde, Marseille',
      url: 'https://www.notredamedelagarde.fr/',
      notes: 'Major pilgrimage site',
    },
  ]),

  ...churchesIn('DE', 'German', [
    {
      name: 'St. Peter, Munich',
      url: 'https://www.alterpeter.de/',
      notes: 'German format with "bis" for ranges',
    },
    {
      name: 'Kölner Dom, Cologne',
      url: 'https://www.koelner-dom.de/',
      notes: 'Cathedral with Uhr time format',
    },
  ]),

  ...churchesIn('ES', 'Spanish', [
    {
      name: 'Sagrada Família, Barcelona',
      url: 'https://sagradafamilia.org/',
      notes: 'Major tourist site, may have complex schedule',
    },
    {
      name: 'Parroquia San Miguel, Madrid',
      url: 'https://www.parroquiasanmiguel.es/',
      notes: 'Spanish format with "de lunes a viernes"',
    },
  ]),

  ...churchesIn('PT', 'Portuguese', [
    {
      name: 'Basílica da Estrela, Lisbon',
      url: 'https://www.basilicadaestrela.com/',
      notes: 'Portuguese format',
    },
  ]),

  ...churchesIn('IT', 'Italian', [
    {
      name: 'Santa Maria Maggiore, Rome',
      url: 'https://www.vatican.va/various/basiliche/sm_maggiore/index_it.htm',
      notes: 'Major basilica',
    },
    {
      name: 'Duomo di Milano',
      url: 'https://www.duomomilano.it/',
      notes: 'Cathedral with Italian format',
    },
  ]),

  ...churchesIn('NL', 'Dutch', [
    {
      name: 'Basiliek van de H. Nicolaas, Amsterdam',
      url: 'https://www.nicolaas-parochie.nl/',
      notes: 'Dutch format with "tot" for ranges',
    },
  ]),

  ...churchesIn('CZ', 'Czech', [
    {
      name: 'Chrám sv. Víta, Prague',
      url: 'https://www.katedralasvatehovita.cz/',
      notes: 'Czech format',
    },
  ]),

  ...churchesIn('HU', 'Hungarian', [
    {
      name: 'Szent István Bazilika, Budapest',
      url: 'https://www.bazilika.biz/',
      notes: 'Hungarian format',
    },
  ]),

  // More complex cases
  ...churchesIn('FR', 'French', [
    {
      name: 'Cathédrale Notre-Dame, Strasbourg',
      url: 'https://www.cathedrale-strasbourg.fr/',
      notes: 'Bilingual region (French/German)',
    },
  ]),
];
|
||||
|
||||
/** Outcome of scraping one sample church; the names match the summary counters in `main`. */
type TestOutcome = 'success' | 'failed' | 'noSchedules';

/**
 * Scrapes a single sample church, prints its parsed schedule grouped by
 * weekday, and reports how the attempt ended.
 *
 * @param church - Sample to test; its `country` is applied to the scraper first.
 * @param scraper - Already initialised scraper shared across all samples.
 * @returns `'success'` when at least one schedule was parsed, `'noSchedules'`
 *   when the scrape succeeded but produced nothing, and `'failed'` when the
 *   scraper reported failure or threw.
 */
async function testChurch(church: TestChurch, scraper: GenericScraper): Promise<TestOutcome> {
  console.log(`\n${'='.repeat(80)}`);
  console.log(`📍 ${church.name}`);
  console.log(` ${church.url}`);
  console.log(` Language: ${church.language} | Country: ${church.country}`);
  if (church.notes) console.log(` Notes: ${church.notes}`);
  console.log(`${'='.repeat(80)}`);

  try {
    scraper.setCountry(church.country);
    const result = await scraper.scrape(church.url);

    if (!result.success) {
      console.log(`❌ FAILED: ${result.error || 'Unknown error'}`);
      return 'failed';
    }

    if (result.schedules.length === 0) {
      console.log(`⚠️ SUCCESS but NO SCHEDULES found`);
      return 'noSchedules';
    }

    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
    console.log(`\n✅ Found ${result.schedules.length} schedules:\n`);

    for (let i = 0; i < 7; i++) {
      if (byDay[i]) {
        const times = byDay[i].map(s => {
          let str = s.time;
          if (s.massType) str += ` (${s.massType})`;
          // English is treated as the unremarkable default and left unlabelled.
          if (s.language && s.language !== 'English') str += ` [${s.language}]`;
          return str;
        }).join(', ');
        console.log(` ${dayNames[i]}: ${times}`);
      }
    }

    return 'success';
  } catch (error) {
    console.log(`❌ ERROR: ${error instanceof Error ? error.message : String(error)}`);
    return 'failed';
  }
}

/**
 * Runs every entry of `testChurches` through one shared scraper, pausing two
 * seconds between sites, then prints how many ended in each outcome.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();

  // One counter per outcome; incremented from the value `testChurch` returns.
  const results: Record<TestOutcome, number> = {
    success: 0,
    failed: 0,
    noSchedules: 0,
  };

  try {
    console.log('🌍 INTERNATIONAL CHURCH SCRAPER TEST');
    console.log(`Testing ${testChurches.length} churches across ${new Set(testChurches.map(c => c.country)).size} countries`);

    for (const church of testChurches) {
      const outcome = await testChurch(church, scraper);
      results[outcome]++;

      // Brief delay between requests to be respectful
      await new Promise(resolve => setTimeout(resolve, 2000));
    }
  } finally {
    // Release the scraper even if a sample throws unexpectedly.
    await scraper.close();
  }

  console.log(`\n${'='.repeat(80)}`);
  console.log('📊 SUMMARY');
  console.log(`${'='.repeat(80)}`);
  console.log(`Total tested: ${testChurches.length}`);
  console.log(`✅ Success with schedules: ${results.success}`);
  console.log(`⚠️ Success but no schedules: ${results.noSchedules}`);
  console.log(`❌ Failed: ${results.failed}`);
}

main().catch(console.error);
|
||||
36
scripts/debug/test-masstimes-api.ts
Normal file
36
scripts/debug/test-masstimes-api.ts
Normal file
@@ -0,0 +1,36 @@
|
||||
/**
|
||||
* Quick test script to verify the masstimes.org JSON API scraper works
|
||||
* Usage: npx tsx scripts/debug/test-masstimes-api.ts
|
||||
*/
|
||||
|
||||
import { writeFileSync } from 'fs';
|
||||
import { MassTimesScraper } from '../../src/lib/masstimes-scraper';
|
||||
|
||||
/**
 * Fetches the churches `MassTimesScraper` finds around a fixed Greenville, SC
 * coordinate and writes them to `scraped-churches.json` in the working directory.
 *
 * On failure the error is logged and the process exit code is set to 1; the
 * scraper is closed in both the success and the failure case.
 */
async function main() {
  console.log('Testing MassTimes.org JSON API Scraper\n');

  const scraper = new MassTimesScraper();

  try {
    await scraper.init();
    console.log('Browser initialized\n');

    const lat = 34.852;
    const lng = -82.394;
    console.log(`Fetching churches near Greenville, SC (${lat}, ${lng})...\n`);

    const churches = await scraper.scrapeByLocation(lat, lng);

    const outPath = 'scraped-churches.json';
    writeFileSync(outPath, JSON.stringify(churches, null, 2));
    console.log(`\nSaved ${churches.length} churches to ${outPath}`);
  } catch (error) {
    console.error('TEST FAILED:', error);
    // Calling `process.exit(1)` here would end the process before the
    // `finally` block runs and skip `scraper.close()`. Setting the exit code
    // instead lets cleanup finish and still reports failure to the shell.
    process.exitCode = 1;
  } finally {
    await scraper.close();
  }
}

// A rejection can still escape `main` if `scraper.close()` itself throws.
main().catch((error: unknown) => {
  console.error('TEST FAILED:', error);
  process.exitCode = 1;
});
|
||||
70
scripts/debug/test-polish-sections.ts
Normal file
70
scripts/debug/test-polish-sections.ts
Normal file
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test which sections are being created for Polish church
|
||||
*/
|
||||
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
// Exact text from the page
const pageText = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();

console.log('Text:');
console.log(pageText);
console.log('\n');

/** Escapes regex metacharacters so a literal day name can be embedded in a pattern. */
const escapeForRegex = (raw: string): string => raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

// Alternation of every Polish day name, longest first (presumably so a longer
// name is tried before a shorter one it contains).
const dayNameAlternation = Object.keys(buildDayPatterns(getDayNamesForCountry('PL')))
  .sort((left, right) => right.length - left.length)
  .map(name => escapeForRegex(name))
  .join('|');

/**
 * Builds the section pattern for one day: the day name after a start/space/
 * punctuation boundary, a colon-or-space separator, then a lazy capture that
 * stops at the next day name or at the end of the text.
 */
const sectionPatternFor = (dayName: string): RegExp =>
  new RegExp(
    `(?:^|\\s|[,;:])${escapeForRegex(dayName)}[:\\s]+([^]*?)(?=${dayNameAlternation}|$)`,
    'i'
  );

// Times written as "H MM" / "HH MM" with whitespace instead of a colon.
const spaceSeparatedTime = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;

console.log('=== Testing individual day matching ===\n');

// Test niedziela specifically
const sundayMatch = pageText.match(sectionPatternFor('niedziela'));

if (!sundayMatch) {
  console.log(`✗ niedziela NOT matched`);
  console.log('');

  // Try simpler regex
  const fallbackMatch = pageText.match(/niedziela[:\s]+(.{0,100})/i);
  if (fallbackMatch) {
    console.log(`Simple regex matched: "${fallbackMatch[1]}"`);
  }
} else {
  console.log(`✓ niedziela matched!`);
  console.log(` Full match: "${sundayMatch[0].substring(0, 100)}"`);
  console.log(` Captured text: "${sundayMatch[1].substring(0, 100)}"`);
  console.log('');

  // Test if times can be extracted from captured text
  const sundayTimes = sundayMatch[1].match(spaceSeparatedTime);
  console.log(` Times in captured text: ${sundayTimes ? sundayTimes.join(', ') : 'none'}`);
}

// Test poniedziałek
console.log('\n=== Testing poniedziałek ===\n');

const mondayMatch = pageText.match(sectionPatternFor('poniedziałek'));

if (!mondayMatch) {
  console.log(`✗ poniedziałek NOT matched`);
} else {
  console.log(`✓ poniedziałek matched!`);
  console.log(` Captured text: "${mondayMatch[1].substring(0, 100)}"`);

  const mondayTimes = mondayMatch[1].match(spaceSeparatedTime);
  console.log(` Times: ${mondayTimes ? mondayTimes.join(', ') : 'none'}`);
}
|
||||
65
scripts/debug/test-polish-with-logging.ts
Normal file
65
scripts/debug/test-polish-with-logging.ts
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test Polish church with detailed section logging
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
// Wrap GenericScraper's parseSchedules for this run only: each call first
// prints the sections findScheduleSections() produces for the page text, then
// defers to the untouched original parser.
const originalParse = GenericScraper.prototype['parseSchedules'];

GenericScraper.prototype['parseSchedules'] = function (html: string) {
  // Reduce the HTML to lower-cased plain text before asking for sections.
  const withoutScripts = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
  const withoutStyles = withoutScripts.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, ' ');
  const plainText = withoutTags.replace(/\s+/g, ' ').toLowerCase();

  // Call findScheduleSections and log result
  const foundSections = this['findScheduleSections'](plainText);

  console.log('\n=== Sections found by findScheduleSections() ===\n');

  const weekdayLabels = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
  foundSections.forEach((entry: any, position: number) => {
    const heading = `Section ${position + 1}: ${weekdayLabels[entry.day]} (day ${entry.day})`;
    const preview = ` Text: "${entry.text.substring(0, 80)}..."`;
    console.log(heading);
    console.log(preview);
  });

  console.log(`\nTotal sections: ${foundSections.length}\n`);

  // Continue with normal processing
  return originalParse.call(this, html);
};
|
||||
|
||||
/**
 * Scrapes the fixed Polish parish URL with the country set to `PL` and prints
 * the parsed mass times grouped by weekday.
 *
 * The scraper is closed in a `finally` block so it is released even when
 * scraping throws.
 */
async function testPolish() {
  const url = 'http://parafialubojna.pl';
  console.log(`Testing: ${url}`);

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('PL');

    const result = await scraper.scrape(url);

    console.log(`\nFinal result: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);

    if (result.schedules.length > 0) {
      const byDay: Record<number, typeof result.schedules> = {};
      for (const sched of result.schedules) {
        if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
        byDay[sched.dayOfWeek].push(sched);
      }

      // Indexed by `dayOfWeek`, where 0 is Sunday.
      const dayNamesPL = ['Niedziela', 'Poniedziałek', 'Wtorek', 'Środa', 'Czwartek', 'Piątek', 'Sobota'];
      console.log('Parsed schedules by day:');
      for (let i = 0; i < 7; i++) {
        if (byDay[i]) {
          console.log(` ${dayNamesPL[i]}: ${byDay[i].map(s => s.time).join(', ')}`);
        }
      }
    }
  } finally {
    await scraper.close();
  }
}

testPolish().catch(console.error);
|
||||
49
scripts/debug/test-time-extraction.ts
Normal file
49
scripts/debug/test-time-extraction.ts
Normal file
@@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test which pattern is matching "00" time
|
||||
*/
|
||||
|
||||
// Test text from German church
const testText = "10:00 uhr lateinisches amt";

// Candidate time formats, each tried independently against the sample text.
const timePatterns = [
  {
    name: '12-hour AM/PM',
    pattern: /(\d{1,2}):(\d{2})\s*(AM|PM|am|pm|a\.m\.|p\.m\.)/g,
  },
  {
    name: '12-hour no minutes',
    pattern: /(?<![:\d])(\d{1,2})\s*(AM|PM|am|pm|a\.m\.|p\.m\.)/g,
  },
  {
    name: '24-hour colon',
    pattern: /(?<![:\d\w])(\d{1,2}):(\d{2})(?!\s*(AM|PM|am|pm))/g,
  },
  {
    name: 'French/Portuguese h',
    pattern: /(?<![:\d\w])(\d{1,2})\s*h\s*(\d{2})?(?!\w)/gi,
  },
  {
    name: 'Italian period',
    pattern: /(?<![:\d\w])(\d{1,2})\.(\d{2})(?=\s|$|,|;|\)|\])/g,
  },
  {
    name: 'German Uhr (old)',
    pattern: /(\d{1,2})[:\.]?(\d{2})?\s*Uhr/gi,
  },
  {
    name: 'German Uhr (fixed)',
    pattern: /(?<![:\d])(\d{1,2})[:\.]?(\d{2})?\s*Uhr/gi,
  },
  {
    name: 'Polish space',
    pattern: /\b(\d{1,2})\s+(\d{2})(?!\d)/g,
  },
];

/**
 * Prints the sample text, then for every entry of `timePatterns` either each
 * match with its index or a "no match" line.
 */
function reportMatches(sample: string): void {
  console.log(`Test text: "${sample}"\n`);

  timePatterns.forEach(({ name, pattern }) => {
    const hits = Array.from(sample.matchAll(pattern));

    if (hits.length === 0) {
      console.log(`✗ ${name}: no match`);
      return;
    }

    console.log(`✓ ${name}:`);
    hits.forEach(hit => {
      console.log(` Matched: "${hit[0]}" at index ${hit.index}`);
    });
  });
}

reportMatches(testText);

// Now test with just "00 uhr"
console.log(`\n${'='.repeat(60)}\n`);
const testText2 = "00 uhr lateinisches";
reportMatches(testText2);
|
||||
193
scripts/debug/test-top5-countries.ts
Normal file
193
scripts/debug/test-top5-countries.ts
Normal file
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Quick test of top 5 priority countries
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
// Fail fast with a clear message when the connection string is missing,
// rather than constructing the pool with an undefined connection string.
const connectionString = process.env.DATABASE_URL;

if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}

// Prisma reaches Postgres through the `pg` driver adapter, so the pool is
// created first and then handed to the client.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
// Country code and display name pairs, in the order they are tested.
const COUNTRY_TABLE: ReadonlyArray<readonly [string, string]> = [
  ['FR', 'France'],
  ['DE', 'Germany'],
  ['ES', 'Spain'],
  ['PL', 'Poland'],
  ['BR', 'Brazil'],
];

const COUNTRIES = COUNTRY_TABLE.map(([code, name]) => ({ code, name }));

// Maximum number of churches fetched for each country.
const PER_COUNTRY = 10;
|
||||
|
||||
/** Aggregated scrape statistics for one country. */
interface CountryResult {
  /** Country code the churches were queried with. */
  country: string;
  /** Display name used in the printed report. */
  countryName: string;
  /** Number of churches that were actually scraped. */
  tested: number;
  /** Scrapes that returned at least one schedule. */
  success: number;
  /** Scrapes that failed, returned no schedules, or threw. */
  failed: number;
  /** `success / tested` as a percentage (0 when nothing was tested). */
  successRate: number;
  /** Has days + times but parsing failed */
  hasBothButFailed: number;
  /** Sum of schedules across all successful scrapes. */
  totalSchedules: number;
  /** One-line description of the first successful scrape, if any. */
  sampleSuccess?: string;
}
|
||||
|
||||
/** Strips script/style blocks and tags, collapses whitespace and lower-cases the rest. */
function htmlToPlainText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

// English, French, German, Spanish, Polish and Portuguese day names.
const DAY_NAME_PATTERN = /\b(sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|terça|terca|quarta|quinta|sexta)\b/i;

// Loose "looks like a time" check: one or two digits followed by h, ':' or '.'.
const TIME_LIKE_PATTERN = /\d{1,2}[h:\.]\s*\d{0,2}/;

/**
 * Heuristic for a parsing bug: the page text contains both a day name and a
 * time-like token even though the scraper extracted nothing.
 */
function hasDaysAndTimes(rawHtml: string): boolean {
  const text = htmlToPlainText(rawHtml);
  return DAY_NAME_PATTERN.test(text) && TIME_LIKE_PATTERN.test(text);
}

/**
 * For every entry in `COUNTRIES`, scrapes up to `PER_COUNTRY` OSM-sourced
 * churches that have a website, prints a per-country summary, and finishes
 * with an overall results table.
 *
 * The scraper and the database handles are released in `finally` blocks so a
 * failure part-way through does not leave them open.
 */
async function testTop5() {
  console.log(`Testing top ${COUNTRIES.length} priority countries (${PER_COUNTRY} churches each)...\n`);

  const results: CountryResult[] = [];

  try {
    const scraper = new GenericScraper();
    await scraper.init();

    try {
      for (const country of COUNTRIES) {
        console.log(`\n${'='.repeat(60)}`);
        console.log(`Testing ${country.name} (${country.code})`);
        console.log('='.repeat(60));

        const churches = await prisma.church.findMany({
          where: {
            country: country.code,
            website: { not: null },
            source: 'osm',
          },
          take: PER_COUNTRY,
          orderBy: { createdAt: 'asc' },
        });

        if (churches.length === 0) {
          console.log(`No churches with websites found for ${country.name}\n`);
          continue;
        }

        scraper.setCountry(country.code);

        let success = 0;
        let failed = 0;
        let hasBothButFailed = 0;
        let totalSchedules = 0;
        let sampleSuccess: string | undefined;

        for (const [index, church] of churches.entries()) {
          process.stdout.write(`[${index + 1}/${churches.length}] ${church.name.substring(0, 40).padEnd(40)} `);

          try {
            // The query above filters on `website: { not: null }`.
            const result = await scraper.scrape(church.website!);

            if (result.success && result.schedules.length > 0) {
              success++;
              totalSchedules += result.schedules.length;
              process.stdout.write(`✅ ${result.schedules.length} schedules\n`);

              // Remember only the first success as the sample.
              if (!sampleSuccess) {
                sampleSuccess = `${church.name}: ${result.schedules.length} schedules`;
              }
            } else {
              failed++;
              // A "successful" scrape with zero schedules carries no error
              // text, so avoid printing `undefined`.
              process.stdout.write(`❌ ${result.error ?? 'No schedules found'}\n`);

              // Check if has both days and times (parsing bug indicator)
              if (result.rawHtml && hasDaysAndTimes(result.rawHtml)) {
                hasBothButFailed++;
                process.stdout.write(` ⚠️ Has days + times but failed to parse\n`);
              }
            }
          } catch (err: unknown) {
            failed++;
            process.stdout.write(`❌ ERROR: ${err instanceof Error ? err.message : String(err)}\n`);
          }
        }

        // `churches.length` is non-zero here because of the `continue` above.
        const successRate = (success / churches.length) * 100;

        results.push({
          country: country.code,
          countryName: country.name,
          tested: churches.length,
          success,
          failed,
          successRate,
          hasBothButFailed,
          totalSchedules,
          sampleSuccess,
        });

        console.log(`\n${country.name} Summary: ${success}/${churches.length} (${successRate.toFixed(0)}%)`);
        console.log(` Total schedules extracted: ${totalSchedules}`);
        if (hasBothButFailed > 0) {
          console.log(` ⚠️ Parsing bugs: ${hasBothButFailed} (has content but failed to parse)`);
        }
      }
    } finally {
      await scraper.close();
    }

    // Final summary
    console.log('\n\n');
    console.log('═'.repeat(80));
    console.log(`FINAL RESULTS - TOP ${COUNTRIES.length} COUNTRIES`);
    console.log('═'.repeat(80));
    console.log('');
    console.log('Country | Tested | Success | Rate | Schedules | Bugs');
    console.log('─'.repeat(80));

    const totalTested = results.reduce((sum, r) => sum + r.tested, 0);
    const totalSuccess = results.reduce((sum, r) => sum + r.success, 0);
    const totalSchedules = results.reduce((sum, r) => sum + r.totalSchedules, 0);
    const totalBugs = results.reduce((sum, r) => sum + r.hasBothButFailed, 0);

    for (const r of results) {
      const country = r.countryName.padEnd(12);
      const tested = String(r.tested).padStart(6);
      const success = String(r.success).padStart(7);
      const rate = `${r.successRate.toFixed(0)}%`.padStart(5);
      const schedules = String(r.totalSchedules).padStart(9);
      const bugs = r.hasBothButFailed > 0 ? `⚠️ ${r.hasBothButFailed}` : '✓';

      console.log(`${country} | ${tested} | ${success} | ${rate} | ${schedules} | ${bugs}`);
    }

    console.log('─'.repeat(80));
    const avgRate = totalTested > 0 ? (totalSuccess / totalTested) * 100 : 0;
    console.log(`OVERALL | ${String(totalTested).padStart(6)} | ${String(totalSuccess).padStart(7)} | ${avgRate.toFixed(0).padStart(4)}% | ${String(totalSchedules).padStart(9)} | ${totalBugs > 0 ? `⚠️ ${totalBugs}` : '✓'}`);
    console.log('');
    console.log('═'.repeat(80));
    console.log('');

    if (totalBugs > 0) {
      console.log(`⚠️ ${totalBugs} parsing bugs detected (has days + times but failed)`);
      console.log(' These need investigation and fixes.\n');
    } else {
      console.log('✅ No parsing bugs! All failures are legitimate (no content or wrong page).\n');
    }

    console.log(`Total churches tested: ${totalTested}`);
    console.log(`Total successful: ${totalSuccess} (${avgRate.toFixed(1)}%)`);
    console.log(`Total mass schedules extracted: ${totalSchedules}`);
    console.log('');
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

testTop5().catch(console.error);
|
||||
173
scripts/debug/test-website-scraper.ts
Normal file
173
scripts/debug/test-website-scraper.ts
Normal file
@@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test website scraper on churches with websites
|
||||
* Analyzes which websites can be scraped successfully
|
||||
*/
|
||||
|
||||
// Load .env
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
import fs from 'fs';
|
||||
|
||||
// Fail fast with a clear message when the connection string is missing,
// rather than constructing the pool with an undefined connection string.
const connectionString = process.env.DATABASE_URL;

if (!connectionString) {
  throw new Error('DATABASE_URL environment variable is not set');
}

// Prisma reaches Postgres through the `pg` driver adapter, so the pool is
// created first and then handed to the client.
const pool = new Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/** Outcome of scraping one church website, as written to the JSON report. */
interface TestResult {
  /** Database id of the church. */
  churchId: string;
  /** Church name as stored in the database. */
  name: string;
  /** URL that was actually scraped (after normalisation). */
  website: string;
  /** Country value stored on the church record. */
  country: string;
  /** Whether the scraper reported success. */
  success: boolean;
  /** Number of schedules the scraper returned. */
  massesFound: number;
  /** Parsed schedules; absent when the scrape threw. */
  schedules?: { dayOfWeek: number; time: string; massType?: string; language?: string }[];
  /** Scraper-reported or thrown error message, if any. */
  error?: string;
}
|
||||
|
||||
/**
 * Ensures a stored website value carries an explicit HTTP(S) scheme.
 *
 * - Surrounding whitespace is removed.
 * - Values that already start with `http://` or `https://` (in any letter
 *   case) are returned as they are.
 * - Protocol-relative values (`//host/path`) get `https:` prepended.
 * - Everything else gets `https://` prepended.
 *
 * @param url - Raw website value, for example from the database.
 * @returns The value with a scheme in front.
 */
function normalizeUrl(url: string): string {
  const trimmed = url.trim();

  // Schemes are case-insensitive, so `HTTP://…` must not be prefixed again.
  if (/^https?:\/\//i.test(trimmed)) {
    return trimmed;
  }

  if (trimmed.startsWith('//')) {
    return `https:${trimmed}`;
  }

  return `https://${trimmed}`;
}
|
||||
|
||||
/**
 * Scrapes the most recently created churches that have a website, prints the
 * result for each, writes all results to `scraper-test-results.json`, and
 * returns them.
 *
 * @param limit - Maximum number of churches to fetch (default 50).
 * @param country - Optional country value to filter on; all countries when omitted.
 * @returns One `TestResult` per church that was attempted.
 */
async function testScrapers(limit: number = 50, country?: string) {
  const results: TestResult[] = [];

  // Get churches with websites
  const whereClause: { website: { not: null }; country?: string } = {
    website: { not: null },
  };

  if (country) {
    whereClause.country = country;
  }

  const churches = await prisma.church.findMany({
    where: whereClause,
    take: limit,
    orderBy: { createdAt: 'desc' },
  });

  console.log(`Testing ${churches.length} churches with websites...\n`);

  // Indexed by `dayOfWeek`, where 0 is Sunday.
  const days = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];

  // Initialize the scraper (launches Playwright browser)
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    for (const [i, church] of churches.entries()) {
      // The query above filters on `website: { not: null }`.
      const url = normalizeUrl(church.website!);
      console.log(`[${i + 1}/${churches.length}] Testing: ${church.name}`);
      console.log(` Website: ${url}`);

      try {
        const result = await scraper.scrape(url);

        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: result.success,
          massesFound: result.schedules.length,
          schedules: result.schedules.map((s) => ({
            dayOfWeek: s.dayOfWeek,
            time: s.time,
            massType: s.massType,
            language: s.language,
          })),
          error: result.error,
        });

        if (result.success) {
          console.log(` ✓ ${result.schedules.length} masses found`);
          for (const s of result.schedules) {
            console.log(` ${days[s.dayOfWeek]} ${s.time} (${s.language || 'English'}${s.massType ? ', ' + s.massType : ''})`);
          }
        } else {
          console.log(` ✗ No masses found: ${result.error}`);
        }
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        console.log(` ✗ Error: ${message}`);
        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: false,
          massesFound: 0,
          error: message,
        });
      }

      console.log('');
    }
  } finally {
    // Always close the browser
    await scraper.close();
  }

  // Summary
  const successful = results.filter((r) => r.success);
  const failed = results.filter((r) => !r.success);
  const totalMasses = results.reduce((sum, r) => sum + r.massesFound, 0);

  // With no results the ratio is 0/0; report 0.0 instead of printing "NaN%".
  const percentOfTotal = (count: number): string =>
    results.length === 0 ? '0.0' : ((count / results.length) * 100).toFixed(1);

  console.log('============================================================');
  console.log('Test Summary');
  console.log('============================================================');
  console.log(`Total churches tested: ${results.length}`);
  console.log(`Successful scrapes: ${successful.length} (${percentOfTotal(successful.length)}%)`);
  console.log(`Failed scrapes: ${failed.length} (${percentOfTotal(failed.length)}%)`);
  console.log(`Total masses found: ${totalMasses}`);
  console.log('============================================================');

  if (failed.length > 0) {
    console.log('\nFailed websites:');
    for (const f of failed) {
      console.log(` - ${f.name}: ${f.website} (${f.error})`);
    }
  }

  console.log('');

  // Export results (without raw HTML to keep file manageable)
  fs.writeFileSync(
    'scraper-test-results.json',
    JSON.stringify(results, null, 2)
  );
  console.log('Results saved to scraper-test-results.json');

  return results;
}
|
||||
|
||||
/**
 * CLI entry point: reads `--limit <n>` (default 50) and `--country <value>`
 * from the command line, runs `testScrapers`, and releases the database
 * handles afterwards.
 *
 * @throws Error when `--limit` is given without a positive integer value.
 */
async function main() {
  const args = process.argv.slice(2);
  const limitIndex = args.indexOf('--limit');
  const countryIndex = args.indexOf('--country');

  try {
    let limit = 50;
    if (limitIndex !== -1) {
      const rawLimit = args[limitIndex + 1];
      limit = Number.parseInt(rawLimit ?? '', 10);
      // Reject a missing or non-numeric value here instead of handing NaN to
      // the database query.
      if (!Number.isInteger(limit) || limit <= 0) {
        throw new Error(`--limit expects a positive integer, got "${rawLimit}"`);
      }
    }
    const country = countryIndex !== -1 ? args[countryIndex + 1] : undefined;

    console.log('============================================================');
    console.log('Website Scraper Testing');
    console.log('============================================================');
    console.log(`Limit: ${limit}`);
    console.log(`Country: ${country || 'All'}`);
    console.log('============================================================\n');

    await testScrapers(limit, country);
  } finally {
    // Release the database handles whether or not the run succeeded.
    await prisma.$disconnect();
    await pool.end();
  }
}

main().catch((error: unknown) => {
  console.error(error);
  // Report the failure to the shell as well as to the console.
  process.exitCode = 1;
});
|
||||
53
scripts/debug/verify-paz-schedules.ts
Normal file
53
scripts/debug/verify-paz-schedules.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Verify Paróquia da Paz schedules are correctly parsed
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes the Paróquia da Paz website with the country set to `BR`, prints
 * the parsed schedule grouped by weekday, and prints the expected schedule
 * below it for manual comparison.
 *
 * The scraper is closed in a `finally` block so it is released even when
 * scraping throws.
 */
async function verifyPazSchedules() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Verifying: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('BR');

    const result = await scraper.scrape(url);

    console.log(`✅ Success: ${result.success}`);
    console.log(`📅 Schedules found: ${result.schedules.length}\n`);

    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    // Indexed by `dayOfWeek`, where 0 is Sunday.
    const dayNames = ['Domingo', 'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado'];

    console.log('═══════════════════════════════════════════════');
    console.log('PARSED SCHEDULE:');
    console.log('═══════════════════════════════════════════════\n');

    const orderedDays = Object.entries(byDay).sort(([a], [b]) => parseInt(a, 10) - parseInt(b, 10));
    for (const [day, scheds] of orderedDays) {
      console.log(`${dayNames[parseInt(day, 10)]}:`);
      for (const s of scheds) {
        // Fall back to a label (also for empty strings) so a missing field
        // does not print as "undefined".
        console.log(` ${s.time} - ${s.language || 'Unknown'} ${s.massType || 'Mass'}`);
      }
      console.log('');
    }

    console.log('Expected schedule (from website):');
    console.log('Segunda, Terça, Quarta, Sexta: 16:00 e 18:00');
    console.log('Quinta: 16:00 e 19:00');
    console.log('Sábado: 08:00, 16:00 e 18:00');
    console.log('Domingo: 08:00, 11:00, 16:00, 18:00 e 20:00');
  } finally {
    await scraper.close();
  }
}

verifyPazSchedules().catch(console.error);
|
||||
Reference in New Issue
Block a user