ScraperControl/scripts/debug/debug-paz-full-flow.ts

#!/usr/bin/env tsx
/**
 * Debug the full parsing flow with section detection
 */

import { GenericScraper } from '../../src/scrapers/strategies/generic';
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';

/** Escape regex metacharacters so `value` is matched literally inside a RegExp source. */
function escapeForRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Reduce raw HTML to lower-cased text: drops `<script>` and `<style>` elements
 * (including their contents), replaces every remaining tag with a space and
 * collapses whitespace runs to a single space.
 */
function htmlToPlainText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();
}

/**
 * Probe `snippet` for a section introduced by `dayLabel` and describe the outcome.
 *
 * The section body is captured lazily up to the next occurrence of any
 * alternative in `allDayNamesPattern`, or to the end of the snippet.
 *
 * @param snippet - Text to search.
 * @param dayLabel - Day heading to look for; matched literally, case-insensitively.
 * @param allDayNamesPattern - Regex alternation (already escaped) of day names that end a section.
 * @returns `Found: "<first 50 chars of the section>"`, or `Not found`.
 */
function describeSectionMatch(
  snippet: string,
  dayLabel: string,
  allDayNamesPattern: string
): string {
  // With no known day names an empty alternation would make the lookahead
  // succeed immediately and capture nothing, so stop at end of input only.
  const stopPattern = allDayNamesPattern ? `${allDayNamesPattern}|$` : '$';
  const sectionRegex = new RegExp(
    `(?:^|\\s|[,;:])${escapeForRegExp(dayLabel)}[:\\s]+([^]*?)(?=${stopPattern})`,
    'i'
  );
  const match = snippet.match(sectionRegex);
  return match ? `Found: "${(match[1] ?? '').substring(0, 50)}"` : 'Not found';
}

/**
 * Scrape the parish home page, print the schedule snippet found in its text,
 * probe that snippet with the day-section regexes, and print how many parsed
 * schedules the scraper returned for each day of the week.
 *
 * The scraper is closed on every exit path, including when scraping or any
 * later step throws.
 */
async function debugFullFlow() {
  const url = 'https://www.paroquiadapaz.org.br/';
  const country = 'BR';
  const snippetLength = 500;
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry(country);

    const result = await scraper.scrape(url);

    if (!result.rawHtml) {
      console.log('No HTML received');
      return;
    }

    const text = htmlToPlainText(result.rawHtml);

    // Find the schedule section
    const scheduleIndex = text.indexOf('segundas, terças');
    if (scheduleIndex === -1) {
      console.log('Schedule text not found!');
      return;
    }

    const snippet = text.substring(scheduleIndex, scheduleIndex + snippetLength);
    console.log('Schedule snippet from actual HTML:');
    console.log(snippet);
    console.log('\n');

    // Now test section matching on actual text.
    // Longest names first so a longer day name wins over a shorter one it contains.
    const dayPatterns = buildDayPatterns(getDayNamesForCountry(country));
    const allDayNamesPattern = Object.keys(dayPatterns)
      .sort((a, b) => b.length - a.length)
      .map(escapeForRegExp)
      .join('|');

    console.log('=== Testing sábados and domingos matches ===\n');

    // Accented and unaccented Saturday spellings, then Sunday.
    for (const dayLabel of ['sábados', 'sabados', 'domingos']) {
      console.log(`${dayLabel} match:`, describeSectionMatch(snippet, dayLabel, allDayNamesPattern));
    }

    console.log('\n=== Final parsed schedules ===\n');
    console.log(`Total: ${result.schedules.length}`);

    // Only the per-day counts are reported, so tally instead of bucketing.
    const countByDay = new Map<number, number>();
    for (const sched of result.schedules) {
      countByDay.set(sched.dayOfWeek, (countByDay.get(sched.dayOfWeek) ?? 0) + 1);
    }

    const dayNames = ['Domingo', 'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado'];
    dayNames.forEach((dayName, dayIndex) => {
      const count = countByDay.get(dayIndex) ?? 0;
      console.log(count > 0 ? `${dayName}: ${count} schedules` : `${dayName}: 0 schedules ❌`);
    });
  } finally {
    // Release the scraper even when scraping or parsing throws.
    await scraper.close();
  }
}

// Script entry point: run the debug flow and report any rejection on stderr.
debugFullFlow().catch((error: unknown) => {
  console.error(error);
});