86 lines
2.8 KiB
TypeScript
86 lines
2.8 KiB
TypeScript
|
|
#!/usr/bin/env tsx
|
||
|
|
/**
 * Debug Paróquia da Paz with added logging
 */
|
||
|
|
|
||
|
|
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||
|
|
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||
|
|
|
||
|
|
/**
 * Scrapes the Paróquia da Paz home page with `GenericScraper` (country 'BR')
 * and logs diagnostics for the comma-separated day-grouping regex.
 *
 * Steps:
 *  1. Scrape the page and log the success flag and schedule count.
 *  2. If raw HTML was returned, reduce it to lowercase plain text.
 *  3. Build the day-group regex from the BR day names and log every match.
 *  4. If nothing matched, look for the literal text 'segundas, terças' and
 *     report which of the first ten day names occur in the nearby snippet.
 *
 * The scraper is always closed once it has been initialised, even when
 * scraping or the diagnostics throw.
 *
 * @returns {Promise<void>} Resolves when diagnostics are printed and the
 *   scraper is closed; rejects with whatever error the scraper raised.
 */
async function debugPazWithLogging() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('BR');

    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);

    if (!result.rawHtml) {
      return;
    }

    // Drop scripts, styles and tags, collapse whitespace, and lowercase so
    // the day-name patterns can be matched against plain text.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Test the regex pattern manually
    console.log('=== Testing comma-separated day grouping regex ===\n');

    const dayConfigs = getDayNamesForCountry('BR');
    const dayPatterns = buildDayPatterns(dayConfigs);

    // Longest names first so the alternation prefers the most specific name.
    const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
    const allDayNamesPattern = sortedDayNames
      .map((d) => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('|');

    console.log('Day patterns:', Object.keys(dayPatterns).join(', '));
    console.log('');

    // Copy of the day-grouping regex being debugged (the original note calls
    // it "the exact regex from the code").
    // Group 1: two or more day names joined by commas/whitespace/conjunction.
    // Group 2: the text that follows, up to the next day name or end of input.
    const dayGroupRegex = new RegExp(
      `((?:${allDayNamesPattern})(?:[,\\s]+(?:e|and|et|und|y)?\\s*(?:${allDayNamesPattern}))+)[:\\s]+([^]*?)(?=(?:${allDayNamesPattern})|$)`,
      'gi'
    );

    console.log('Regex pattern:', `${dayGroupRegex.source.substring(0, 200)}...\n`);

    let matchCount = 0;
    for (const groupMatch of text.matchAll(dayGroupRegex)) {
      matchCount++;
      console.log(`Match #${matchCount}:`);
      console.log(` Full match: "${groupMatch[0].substring(0, 100)}"`);
      console.log(` Day group: "${groupMatch[1]}"`);
      console.log(` Time text: "${groupMatch[2].substring(0, 50)}"`);
      console.log('');
    }

    if (matchCount > 0) {
      return;
    }

    console.log('No matches found!\n');

    // Try to find the schedule text manually
    const scheduleIndex = text.indexOf('segundas, terças');
    if (scheduleIndex === -1) {
      return;
    }

    const snippet = text.substring(scheduleIndex, scheduleIndex + 300);
    console.log('Found schedule text at position', scheduleIndex);
    console.log('Snippet:', snippet);
    console.log('');

    // Test if individual day names are matching
    console.log('Testing individual day name matches in snippet:');
    for (const dayName of sortedDayNames.slice(0, 10)) {
      if (snippet.includes(dayName)) {
        console.log(` ✓ Found: ${dayName}`);
      }
    }
  } finally {
    // Release the scraper on every exit path: early return, success, or error.
    await scraper.close();
  }
}
|
||
|
|
|
||
|
|
// Run the debug routine; on failure log the error and mark the process as
// failed so shells and CI do not see a zero exit status.
debugPazWithLogging().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|