From b93c7808a438ca5beb4455e1f8adf90805e42469 Mon Sep 17 00:00:00 2001 From: albertfj114 Date: Mon, 16 Mar 2026 23:35:10 -0400 Subject: [PATCH] docs: add design spec for BuscarMisas network importer (BR/MX/AR/CO/CL) --- ...-16-buscarmisas-network-importer-design.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/superpowers/specs/2026-03-16-buscarmisas-network-importer-design.md diff --git a/docs/superpowers/specs/2026-03-16-buscarmisas-network-importer-design.md b/docs/superpowers/specs/2026-03-16-buscarmisas-network-importer-design.md new file mode 100644 index 0000000..dee1352 --- /dev/null +++ b/docs/superpowers/specs/2026-03-16-buscarmisas-network-importer-design.md @@ -0,0 +1,135 @@ +# BuscarMisas Network Importer — Design Spec + +**Date:** 2026-03-16 +**Status:** Approved + +--- + +## Overview + +Add a single importer `scripts/import-buscarmisas-network.ts` that scrapes church data and mass schedules from a network of 5 identical WordPress-based Catholic mass-time directories covering 5 Latin American countries (~15,294 churches total). 
+ +--- + +## Network Sites + +| Domain | Country | Churches | Language | Sitemap Type | +|--------|---------|----------|----------|--------------| +| `horariosmissa.com.br` | BR (Brazil) | ~4,732 | Portuguese | `page-sitemap*.xml` | +| `buscarmisas.com.mx` | MX (Mexico) | ~3,950 | Spanish | `page-sitemap*.xml` | +| `horariosmisa.com.ar` | AR (Argentina) | ~3,012 | Spanish | `page-sitemap*.xml` | +| `buscarmisas.co` | CO (Colombia) | ~2,665 | Spanish | `page-sitemap*.xml` | +| `horariomisa.cl` | CL (Chile) | ~935 | Spanish | `post-sitemap.xml` | + +--- + +## Architecture + +### Config Map + +```ts +interface SiteConfig { + country: string; // ISO 3166-1 alpha-2 + language: 'pt' | 'es'; + sitemapType: 'page' | 'post'; +} + +const NETWORK_SITES: Record<string, SiteConfig> = { + 'horariosmissa.com.br': { country: 'BR', language: 'pt', sitemapType: 'page' }, + 'buscarmisas.com.mx': { country: 'MX', language: 'es', sitemapType: 'page' }, + 'horariosmisa.com.ar': { country: 'AR', language: 'es', sitemapType: 'page' }, + 'buscarmisas.co': { country: 'CO', language: 'es', sitemapType: 'page' }, + 'horariomisa.cl': { country: 'CL', language: 'es', sitemapType: 'post' }, +}; +``` + +### CLI Interface + +```bash +# Single domain +npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br + +# All domains sequentially +npx tsx scripts/import-buscarmisas-network.ts --all + +# Resume after interruption +npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --resume-from 1200 + +# Dry run (no DB writes) +npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --dry-run +``` + +Source slug stored in DB: domain with dots replaced by dashes, e.g. `horariosmissa-com-br`. + +--- + +## Data Flow + +### 1. 
Sitemap Discovery + +- Fetch `https://{domain}/sitemap_index.xml` +- Extract child sitemap URLs +- For `sitemapType: 'page'`: collect all `page-sitemap*.xml` URLs +- For `sitemapType: 'post'`: collect `post-sitemap.xml` +- Fetch each child sitemap, filter to 3-segment church URLs (path depth = `/{region}/{city}/{church-slug}/`) +- Collect deduplicated list of church page URLs + +### 2. Church Page Parsing + +For each church URL, fetch the HTML and extract: + +| Field | Source | +|-------|--------| +| Name | Table cell after `Nome` (PT) or `Nombre` (ES) | +| Address | Table cell after `Endereço` (PT) or `Dirección` (ES) | +| Phone | `href="tel:..."` anchor | +| Latitude/Longitude | Google Maps iframe `src` — `center={lat}%2C{lng}` parameter | +| Country | From `SiteConfig.country` | +| State/Region | 1st path segment of church URL | +| City | 2nd path segment of church URL | +| Mass schedule | Mon–Sun table rows: day name → time string (skip `-` entries) | + +### 3. Schedule Parsing + +- Day names resolved via existing `src/scrapers/i18n/day-names.ts` patterns +- Portuguese: `Segunda-feira`, `Terça-feira`, `Quarta-feira`, `Quinta-feira`, `Sexta-feira`, `Sábado`, `Domingo` +- Spanish: `Lunes`, `Martes`, `Miércoles`, `Jueves`, `Viernes`, `Sábado`, `Domingo` +- Times are comma-separated within a single cell (e.g. `10:00, 18:00`) — split and create one schedule entry per time +- `-` entries indicate no mass that day — skip + +### 4. Upsert to Database + +- Use `findDuplicateChurch` (existing church-matcher) to check for duplicates before insert +- Upsert church with `source = {domain-slug}` +- Upsert mass schedules linked to church +- Track progress: log every 100 churches, support `--resume-from {index}` + +### 5. 
Rate Limiting + +- 2-second delay between church page fetches (same as `import-discovermass.ts`) +- 5-second delay between domains when running `--all` +- Respect HTTP 429 / 503 with exponential backoff (up to 3 retries) + +--- + +## Error Handling + +- Skip churches where lat/lng cannot be extracted (log warning, continue) +- Skip churches where name is empty +- On fetch error: log and continue to next URL (don't abort the run) +- On DB error: log and continue + +--- + +## Integration + +- Add `import:buscarmisas-network` script to `package.json` +- Add to scheduler pipeline alongside other importers +- No new dependencies required + +--- + +## Out of Scope + +- The `horairemesses.ch` (Switzerland), `gottesdienstheute.de` (Germany), and `masstime.co.uk` (UK) network sites are excluded — those countries already have dedicated importers +- Chile's `page-sitemap.xml` contains only city pages (not churches) — only `post-sitemap.xml` is used for Chile