From 0e468bcb94d3b59a595fa9e3471d2b2832fb4341 Mon Sep 17 00:00:00 2001 From: albertfj114 Date: Tue, 10 Mar 2026 19:50:54 -0400 Subject: [PATCH] docs: add Brazil + Spain importers design spec and implementation plan Two new importers: - horariodemissa.com.br: 8,895 Brazilian churches + 28,523 mass times - misas.org: 17,919 Spanish churches with coordinates Co-Authored-By: Claude Sonnet 4.6 --- .../2026-03-10-brazil-spain-importers.md | 1383 +++++++++++++++++ ...026-03-10-brazil-spain-importers-design.md | 192 +++ 2 files changed, 1575 insertions(+) create mode 100644 docs/superpowers/plans/2026-03-10-brazil-spain-importers.md create mode 100644 docs/superpowers/specs/2026-03-10-brazil-spain-importers-design.md diff --git a/docs/superpowers/plans/2026-03-10-brazil-spain-importers.md b/docs/superpowers/plans/2026-03-10-brazil-spain-importers.md new file mode 100644 index 0000000..51958b5 --- /dev/null +++ b/docs/superpowers/plans/2026-03-10-brazil-spain-importers.md @@ -0,0 +1,1383 @@ +# Brazil + Spain Importers Implementation Plan + +> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add two new church importers — horariodemissa.com.br (8,895 Brazilian churches + 28,523 mass times) and misas.org (17,919 Spanish churches with coordinates). + +**Architecture:** Chunk 1 (shared prerequisites) must complete first. Tasks 3–5 (Brazil) and Tasks 6–7 (Spain) are independent and can run in parallel as subagents. All scripts follow the established importer pattern: fetch → regex parse → church-matcher dedup → prisma upsert. 
+ +**Tech Stack:** TypeScript, tsx, native `fetch`, regex HTML parsing (matchAll), Prisma + pg, church-matcher + +**Spec:** `docs/superpowers/specs/2026-03-10-brazil-spain-importers-design.md` + +--- + +## Chunk 1: Shared Prerequisites (schema + church-matcher) + +### Task 1: Schema additions + +**Files:** +- Modify: `prisma/schema.prisma` + +- [ ] **Step 1: Add two new ID fields to the Church model** + +In `prisma/schema.prisma`, find the block of importer ID fields (near `gottesdienstzeitenId`) and add after it: + +```prisma +horarioDemissaId String? @unique @map("horario_demissa_id") +misasOrgId String? @unique @map("misas_org_id") +``` + +Then add two indexes in the `@@index` block at the bottom of the Church model: + +```prisma +@@index([horarioDemissaId]) +@@index([misasOrgId]) +``` + +- [ ] **Step 2: Regenerate Prisma client** + +```bash +npx prisma generate +``` + +Expected: `✔ Generated Prisma Client` with no errors. + +- [ ] **Step 3: Verify the fields exist in generated types** + +```bash +grep -n "horarioDemissaId\|misasOrgId" node_modules/.prisma/client/index.d.ts | head -10 +``` + +Expected: both fields appear in the type definitions. 
+ +- [ ] **Step 4: Commit** + +```bash +git add prisma/schema.prisma +git commit -m "feat: add horarioDemissaId and misasOrgId fields to Church schema" +``` + +--- + +### Task 2: church-matcher updates + +**Files:** +- Modify: `src/lib/church-matcher.ts` + +- [ ] **Step 1: Add new fields to ExistingChurch interface** + +In `src/lib/church-matcher.ts`, find `ExistingChurch` interface and add after `gottesdienstzeitenId`: + +```typescript +horarioDemissaId: string | null; +misasOrgId: string | null; +``` + +- [ ] **Step 2: Add new fields to ChurchCandidate type** + +Find `ChurchCandidate` type and add after `gottesdienstzeitenId?`: + +```typescript +horarioDemissaId?: string; +misasOrgId?: string; +``` + +- [ ] **Step 3: Add two new exact-match passes in findDuplicateChurch** + +After the Thirteenth pass (gottesdienstzeitenId), add before the proximity pass: + +```typescript + // Fourteenth pass: exact horarioDemissaId match + if (candidate.horarioDemissaId) { + const match = existingChurches.find( + (church) => church.horarioDemissaId === candidate.horarioDemissaId + ); + if (match) return match; + } + + // Fifteenth pass: exact misasOrgId match + if (candidate.misasOrgId) { + const match = existingChurches.find( + (church) => church.misasOrgId === candidate.misasOrgId + ); + if (match) return match; + } +``` + +- [ ] **Step 4: Verify TypeScript compiles** + +```bash +npx tsc --noEmit +``` + +Expected: no errors. + +- [ ] **Step 5: Commit** + +```bash +git add src/lib/church-matcher.ts +git commit -m "feat: add horarioDemissaId and misasOrgId to church-matcher" +``` + +--- + +## Chunk 2: Brazil Importer (import-horariodemissa.ts) + +> Depends on Chunk 1. Can run in parallel with Chunk 3. 
+ +### Task 3: Boilerplate + sitemap enumeration + +**Files:** +- Create: `scripts/import-horariodemissa.ts` + +- [ ] **Step 1: Create script with boilerplate + types + sitemap parsing** + +Create `scripts/import-horariodemissa.ts`: + +```typescript +#!/usr/bin/env tsx +/** + * Import Catholic churches and mass schedules from horariodemissa.com.br (Brazil) + * + * horariodemissa.com.br has 8,895 churches across all 26 Brazilian states + DF, + * with 28,523 mass times. All data is server-rendered — one HTTP request per city + * page returns all churches + schedules for that city. + * + * City pages have a split structure: + * - Address/phone: embedded in JS h.push() strings (sidebar/map data) + * - Schedules: in server-rendered .result divs with rows + * Both sets are linked by the same church key (e.g. "dvey2"). + * + * Import strategy: + * 1. Fetch sitemap.xml → deduplicate to pt-only city URLs (~3,552 cities) + * 2. For each city: fetch page → parse address/phone from JS + schedules from DOM + * 3. Join by church key, match against existing BR churches, upsert + * 4. 
Optional --geocode flag for Nominatim pass after import + * + * Usage: + * npx tsx scripts/import-horariodemissa.ts --all + * npx tsx scripts/import-horariodemissa.ts --all --dry-run + * npx tsx scripts/import-horariodemissa.ts --state SP + * npx tsx scripts/import-horariodemissa.ts --all --resume-from 500 + * npx tsx scripts/import-horariodemissa.ts --all --geocode + * npx tsx scripts/import-horariodemissa.ts --geocode-only + * npx tsx scripts/import-horariodemissa.ts --all --job-id {uuid} + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const SITE_BASE = 'https://horariodemissa.com.br'; +const SITEMAP_URL = `${SITE_BASE}/sitemap.xml`; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; +const REQUEST_DELAY_MS = 1500; +const NOMINATIM_DELAY_MS = 1100; +const NOMINATIM_URL = 'https://nominatim.openstreetmap.org/search'; + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface CityUrl { + state: string; // e.g. "SP" + city: string; // e.g. 
"São Paulo" + url: string; // full fetch URL +} + +interface ParsedSchedule { + dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat + time: string; // "HH:MM" + notes: string | null; +} + +interface ParsedConfession { + dayOfWeek: number; + startTime: string; + endTime: string; + notes: string | null; +} + +interface ParsedChurch { + key: string; // e.g. "dvey2" (used as horarioDemissaId) + name: string; + address: string | null; + phone: string | null; + city: string; + state: string; + massSchedules: ParsedSchedule[]; + confessionSchedules: ParsedConfession[]; +} + +interface CLIArgs { + all: boolean; + state?: string; + dryRun: boolean; + geocode: boolean; + geocodeOnly: boolean; + resumeFrom?: number; + jobId?: string; +} + +interface ImportStats { + citiesProcessed: number; + churchesFound: number; + churchesCreated: number; + churchesUpdated: number; + massSchedulesCreated: number; + geocoded: number; + geocodeFailed: number; + errors: number; +} + +// ─── Brazilian Day Name Mapping ─────────────────────────────────────────────── + +const DAY_MAP: Record<string, number> = { + 'domingo': 0, + 'segunda-feira': 1, 'segunda': 1, + 'terça-feira': 2, 'terca-feira': 2, 'terça': 2, + 'quarta-feira': 3, 'quarta': 3, + 'quinta-feira': 4, 'quinta': 4, + 'sexta-feira': 5, 'sexta': 5, + 'sábado': 6, 'sabado': 6, +}; + +const SPECIAL_DAY_MAP: Record<string, { dayOfWeek: number; notes: string }> = { + 'primeiro domingo': { dayOfWeek: 0, notes: 'Primeiro Domingo' }, + 'segundo domingo': { dayOfWeek: 0, notes: 'Segundo Domingo' }, + 'terceiro domingo': { dayOfWeek: 0, notes: 'Terceiro Domingo' }, + 'quarto domingo': { dayOfWeek: 0, notes: 'Quarto Domingo' }, + 'primeiro sábado': { dayOfWeek: 6, notes: 'Primeiro Sábado' }, + 'primeiro sabado': { dayOfWeek: 6, notes: 'Primeiro Sábado' }, + 'segundo sábado': { dayOfWeek: 6, notes: 'Segundo Sábado' }, + 'segundo sabado': { dayOfWeek: 6, notes: 'Segundo Sábado' }, +}; + +// ─── HTTP Client ────────────────────────────────────────────────────────────── + +let requestCount = 0; + +function 
delay(ms: number): Promise<void> { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function fetchPage(url: string, delayMs: number = REQUEST_DELAY_MS): Promise<string | null> { + if (requestCount > 0) await delay(delayMs); + requestCount++; + + try { + const response = await fetch(url, { + headers: { + 'User-Agent': USER_AGENT, + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'pt-BR,pt;q=0.9', + }, + }); + + if (!response.ok) { + console.error(` HTTP ${response.status} for ${url}`); + return null; + } + + return await response.text(); + } catch (error) { + console.error(` Fetch error for ${url}: ${error instanceof Error ? error.message : error}`); + return null; + } +} + +// ─── Sitemap Parser ─────────────────────────────────────────────────────────── + +export function parseCityUrlsFromSitemap(sitemapXml: string, filterState?: string): CityUrl[] { + const seen = new Set<string>(); + const cities: CityUrl[] = []; + + for (const match of sitemapXml.matchAll(/<loc>([^<]+)<\/loc>/g)) { + const rawUrl = match[1].replace(/&amp;/g, '&'); + + // Only pt-language city search pages + if (!rawUrl.includes('opcoes=cidade_opcoes') || rawUrl.includes('hl=en')) continue; + + const ufMatch = rawUrl.match(/[?&]uf=([A-Z]+)/); + const cidadeMatch = rawUrl.match(/[?&]cidade=([^&]+)/); + if (!ufMatch || !cidadeMatch) continue; + + const state = ufMatch[1]; + const city = decodeURIComponent(cidadeMatch[1].replace(/\+/g, ' ')); + + if (filterState && state !== filterState.toUpperCase()) continue; + + const key = `${state}:${city}`; + if (seen.has(key)) continue; + seen.add(key); + + cities.push({ state, city, url: rawUrl }); + } + + cities.sort((a, b) => a.state.localeCompare(b.state) || a.city.localeCompare(b.city)); + return cities; +} + +async function fetchCityUrls(filterState?: string): Promise<CityUrl[]> { + console.log(`Fetching sitemap: ${SITEMAP_URL}`); + const xml = await fetchPage(SITEMAP_URL); + if (!xml) throw new Error('Failed to fetch sitemap'); 
+ + const cities = parseCityUrlsFromSitemap(xml, filterState); + console.log(`Found ${cities.length} unique cities${filterState ? ` in ${filterState}` : ''}`); + return cities; +} +``` + +- [ ] **Step 2: Verify sitemap parsing works** + +```bash +npx tsx -e " +import dotenv from 'dotenv'; +dotenv.config({ path: '.env' }); +const { parseCityUrlsFromSitemap } = await import('./scripts/import-horariodemissa.ts'); +const xml = await fetch('https://horariodemissa.com.br/sitemap.xml').then(r => r.text()); +const cities = parseCityUrlsFromSitemap(xml); +console.log('Total cities:', cities.length); +console.log('Sample:', JSON.stringify(cities.slice(0, 3), null, 2)); +const states = [...new Set(cities.map(c => c.state))].sort(); +console.log('States:', states.join(', ')); +" +``` + +Expected: ~3,500 cities, states include SP, RJ, MG, RS, BA, DF, etc. + +- [ ] **Step 3: Commit** + +```bash +git add scripts/import-horariodemissa.ts +git commit -m "feat: horariodemissa importer scaffold + sitemap enumeration" +``` + +--- + +### Task 4: HTML parsing + +**Files:** +- Modify: `scripts/import-horariodemissa.ts` + +- [ ] **Step 1: Understand the dual-source page structure** + +Each city page contains two data sources per church, joined by the same key (e.g. `dvey2`): + +**Source A** — JS `h.push()` strings embedded in `