Files
ScraperControl/scripts/debug/pipeline-report.js

168 lines
5.7 KiB
JavaScript
Raw Normal View History

const { Client } = require("pg");

// Single shared connection used by every report query below.
// The target database can be overridden with the DATABASE_URL environment
// variable so credentials and hosts do not have to be edited in source; when
// it is unset the script falls back to the original hard-coded instance.
const client = new Client({
  connectionString:
    process.env.DATABASE_URL ??
    "postgresql://postgres:postgres@192.168.0.145:5434/nearestmass",
});
/**
 * Report definitions, executed in order by run().
 *
 * Each entry has:
 *   - name: heading printed above the result table
 *   - sql:  a self-contained, parameterless SELECT statement
 *
 * The SQL bodies are kept flush-left on purpose: they live inside template
 * literals, so any indentation would become part of the text sent to the
 * server.
 */
const queries = [
  {
    name: "1. Overall church counts by country (top 20)",
    sql: `SELECT country, COUNT(*) as total,
COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped,
COUNT(*) FILTER (WHERE has_website = true) as has_website_flag,
COUNT(*) FILTER (WHERE website_language IS NOT NULL) as has_language
FROM churches
GROUP BY country
ORDER BY total DESC
LIMIT 20`,
  },
  {
    name: "2. Total mass schedule counts",
    sql: `SELECT COUNT(*) as total_schedules,
COUNT(DISTINCT church_id) as churches_with_schedules
FROM mass_schedules`,
  },
  {
    name: "3. Scrape results by language",
    // COUNT(*) here covers every church with a detected language, scraped or
    // not, so it is labelled "total" (as in reports 1 and 12) rather than
    // "total_scraped"; the "scraped" column carries the filtered count.
    sql: `SELECT website_language as language,
COUNT(*) as total,
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped
FROM churches
WHERE website_language IS NOT NULL
GROUP BY website_language
ORDER BY total DESC`,
  },
  {
    name: "4. Churches with websites but never scraped",
    sql: `SELECT COUNT(*) as has_website_not_scraped
FROM churches
WHERE website IS NOT NULL AND last_scraped_at IS NULL`,
  },
  {
    name: "5. Overall pipeline funnel",
    sql: `SELECT
COUNT(*) as total_churches,
COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as attempted_scrape,
COUNT(*) FILTER (WHERE website_language IS NOT NULL) as has_detected_language,
(SELECT COUNT(DISTINCT church_id) FROM mass_schedules) as has_schedules_saved,
(SELECT COUNT(*) FROM mass_schedules) as total_schedule_rows
FROM churches`,
  },
  {
    name: "6. Recent scrape activity (last 7 days) by language",
    sql: `SELECT website_language as language,
COUNT(*) as scraped_last_7d
FROM churches
WHERE last_scraped_at > NOW() - INTERVAL '7 days'
GROUP BY website_language
ORDER BY scraped_last_7d DESC`,
  },
  {
    name: "7. Background job history (last 15 completed/failed jobs)",
    // PostgreSQL sorts NULLs first under DESC. NULLS LAST keeps any job whose
    // completed_at was never set from crowding out the most recent jobs
    // within the LIMIT.
    sql: `SELECT type, language, status,
created_at::date as created,
completed_at::date as completed,
ROUND(CAST(EXTRACT(EPOCH FROM (completed_at - created_at))/3600 AS numeric), 2) as hours,
total_items, processed, succeeded, failed
FROM background_jobs
WHERE status IN ('completed', 'failed')
ORDER BY completed_at DESC NULLS LAST
LIMIT 15`,
  },
  {
    name: "8. Mass schedule breakdown by day of week",
    sql: `SELECT day_of_week,
CASE day_of_week
WHEN 0 THEN 'Sunday' WHEN 1 THEN 'Monday' WHEN 2 THEN 'Tuesday'
WHEN 3 THEN 'Wednesday' WHEN 4 THEN 'Thursday' WHEN 5 THEN 'Friday'
WHEN 6 THEN 'Saturday' ELSE 'Other'
END as day_name,
COUNT(*) as count
FROM mass_schedules
GROUP BY day_of_week
ORDER BY day_of_week`,
  },
  {
    name: "9. Churches with schedules by country (top 15)",
    sql: `SELECT c.country,
COUNT(DISTINCT c.id) as total_churches,
COUNT(DISTINCT ms.church_id) as churches_with_schedules,
ROUND(100.0 * COUNT(DISTINCT ms.church_id) / NULLIF(COUNT(DISTINCT c.id), 0), 1) as coverage_pct,
COUNT(ms.id) as total_schedule_rows
FROM churches c
LEFT JOIN mass_schedules ms ON ms.church_id = c.id
GROUP BY c.country
ORDER BY total_churches DESC
LIMIT 15`,
  },
  {
    name: "10. Enrichment sources - how churches were found",
    sql: `SELECT source, COUNT(*) as count
FROM churches
GROUP BY source
ORDER BY count DESC`,
  },
  {
    name: "11. Google Places enrichment impact",
    sql: `SELECT
COUNT(*) FILTER (WHERE google_place_id IS NOT NULL) as has_google_place,
COUNT(*) FILTER (WHERE google_place_id IS NOT NULL AND website IS NOT NULL) as google_with_website,
COUNT(*) FILTER (WHERE google_place_id IS NULL) as no_google_place,
COUNT(*) FILTER (WHERE google_searched_at IS NOT NULL) as google_searched,
COUNT(*) FILTER (WHERE free_searched_at IS NOT NULL) as free_searched
FROM churches`,
  },
  {
    name: "12. Website presence by source",
    sql: `SELECT source,
COUNT(*) as total,
COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
ROUND(100.0 * COUNT(*) FILTER (WHERE website IS NOT NULL) / NULLIF(COUNT(*), 0), 1) as website_pct,
COUNT(*) FILTER (WHERE google_place_id IS NOT NULL) as has_google_place,
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped
FROM churches
GROUP BY source
ORDER BY total DESC`,
  },
];
/**
 * Formats a non-empty result set as the lines of a fixed-width text table.
 *
 * Column order and names come from the keys of the first row. null and
 * undefined cells are shown as "NULL"; every other value goes through
 * String(). Widths are tracked in a single pass rather than with
 * Math.max(...values), which throws a RangeError once the number of rows
 * exceeds the engine's argument limit.
 *
 * @param {object[]} rows - Result rows; must contain at least one element.
 * @returns {string[]} Header line, separator line, then one line per row.
 */
function renderTable(rows) {
  const cols = Object.keys(rows[0]);
  const widths = cols.map((col) => col.length);

  // Stringify each cell exactly once, widening its column as we go.
  const cells = rows.map((row) =>
    cols.map((col, i) => {
      const text = String(row[col] ?? "NULL");
      if (text.length > widths[i]) {
        widths[i] = text.length;
      }
      return text;
    })
  );

  const lines = [
    cols.map((col, i) => col.padEnd(widths[i])).join(" | "),
    widths.map((w) => "-".repeat(w)).join("-+-"),
  ];
  for (const rowCells of cells) {
    lines.push(rowCells.map((text, i) => text.padEnd(widths[i])).join(" | "));
  }
  return lines;
}

/**
 * Connects to the database, runs every entry of `queries` in order and
 * prints each result as a text table under a banner with the query name.
 *
 * A query that fails is reported inline as "ERROR: <message>" and the report
 * continues with the next query. The connection is closed even when
 * something unexpected throws part-way through.
 *
 * @returns {Promise<void>} Resolves once all queries ran and the client closed.
 * @throws Rejects if connecting or closing the client fails.
 */
async function run() {
  await client.connect();
  try {
    const rule = "=".repeat(90);
    for (const q of queries) {
      console.log(rule);
      console.log(q.name);
      console.log(rule);
      try {
        const { rows } = await client.query(q.sql);
        if (rows.length === 0) {
          console.log("(no rows returned)");
        } else {
          for (const line of renderTable(rows)) {
            console.log(line);
          }
        }
        console.log("\n(" + rows.length + " rows)\n");
      } catch (err) {
        // Best effort: one broken query must not abort the whole report.
        console.log("ERROR:", err.message, "\n");
      }
    }
  } finally {
    await client.end();
  }
}
// Script entry point: start the report. Anything that escapes run() is
// printed and the process exits with a non-zero status.
run().catch((error) => {
  console.error(error);
  process.exit(1);
});