chore: sync with Gitea master and restore local-only files

Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 19:11:22 -04:00
parent 76cca3ba75
commit 2c51513851
133 changed files with 30381 additions and 0 deletions
--- a/scripts/debug/debug-paz-with-logging.ts
+++ b/scripts/debug/debug-paz-with-logging.ts
@@ -0,0 +1,85 @@
+#!/usr/bin/env tsx
+/**
+ * Debug Paróquia da Paz with added logging
+ */
+
+import { GenericScraper } from '../../src/scrapers/strategies/generic';
+import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
+
+/**
+ * Debug helper: scrapes the Paróquia da Paz homepage with the 'BR' day names and replays
+ * the comma-separated day-group regex on the stripped page text, logging every match.
+ * The scraper is closed in `finally`, so a failing scrape or analysis still releases it.
+ */
+async function debugPazWithLogging() {
+  const url = 'https://www.paroquiadapaz.org.br/';
+  console.log(`Debugging: ${url}\n`);
+
+  const scraper = new GenericScraper();
+  await scraper.init();
+  try {
+    scraper.setCountry('BR');
+    const result = await scraper.scrape(url);
+    console.log(`Success: ${result.success}`);
+    console.log(`Schedules: ${result.schedules.length}\n`);
+    if (!result.rawHtml) return;
+
+    // Drop <script>/<style> blocks and all tags, collapse whitespace, lowercase for matching.
+    const text = result.rawHtml
+      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
+      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
+      .replace(/<[^>]+>/g, ' ')
+      .replace(/\s+/g, ' ')
+      .toLowerCase();
+
+    // Test the regex pattern manually
+    console.log('=== Testing comma-separated day grouping regex ===\n');
+    const dayConfigs = getDayNamesForCountry('BR');
+    const dayPatterns = buildDayPatterns(dayConfigs);
+    // Longest names first: regex alternation is ordered, so longer day names win over shorter ones.
+    const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
+    const allDayNamesPattern = sortedDayNames.map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');
+
+    console.log('Day patterns:', Object.keys(dayPatterns).join(', '));
+    console.log('');
+
+    // Day-group regex under test (NOTE(review): presumably mirrors the scraper's own; verify).
+    const dayGroupRegex = new RegExp(
+      `((?:${allDayNamesPattern})(?:[,\\s]+(?:e|and|et|und|y)?\\s*(?:${allDayNamesPattern}))+)[:\\s]+([^]*?)(?=(?:${allDayNamesPattern})|$)`,
+      'gi'
+    );
+    console.log('Regex pattern:', dayGroupRegex.source.substring(0, 200) + '...\n');
+
+    let matchCount = 0;
+    for (const groupMatch of text.matchAll(dayGroupRegex)) {
+      matchCount++;
+      console.log(`Match #${matchCount}:`);
+      console.log(`  Full match: "${groupMatch[0].substring(0, 100)}"`);
+      console.log(`  Day group: "${groupMatch[1]}"`);
+      console.log(`  Time text: "${groupMatch[2].substring(0, 50)}"`);
+      console.log('');
+    }
+    if (matchCount > 0) return;
+
+    console.log('No matches found!\n');
+    // Try to find the schedule text manually
+    const scheduleIndex = text.indexOf('segundas, terças');
+    if (scheduleIndex === -1) return;
+    const snippet = text.substring(scheduleIndex, scheduleIndex + 300);
+    console.log('Found schedule text at position', scheduleIndex);
+    console.log('Snippet:', snippet);
+    console.log('');
+
+    // Test if individual day names are matching (only the ten longest names are probed)
+    console.log('Testing individual day name matches in snippet:');
+    for (const dayName of sortedDayNames.slice(0, 10)) {
+      if (snippet.includes(dayName)) {
+        console.log(`  ✓ Found: ${dayName}`);
+      }
+    }
+  } finally {
+    await scraper.close();
+  }
+}
+
+debugPazWithLogging().catch((err: unknown) => { console.error(err); process.exitCode = 1; }); // log the failure and exit non-zero