From c4ce4749442fda91d0862b8996e034f26bf715f0 Mon Sep 17 00:00:00 2001 From: albertfj114 Date: Mon, 16 Mar 2026 23:40:53 -0400 Subject: [PATCH] docs: finalize BuscarMisas network importer spec (all review issues resolved) --- ...-16-buscarmisas-network-importer-design.md | 101 +++++++++++++----- 1 file changed, 75 insertions(+), 26 deletions(-) diff --git a/docs/superpowers/specs/2026-03-16-buscarmisas-network-importer-design.md b/docs/superpowers/specs/2026-03-16-buscarmisas-network-importer-design.md index dee1352..eb461f3 100644 --- a/docs/superpowers/specs/2026-03-16-buscarmisas-network-importer-design.md +++ b/docs/superpowers/specs/2026-03-16-buscarmisas-network-importer-design.md @@ -23,6 +23,26 @@ Add a single importer `scripts/import-buscarmisas-network.ts` that scrapes churc --- +## Schema Migration (prerequisite) + +A new column must be added in **BethelGuide** (schema source of truth) before implementation: + +```prisma +buscarmisasNetworkId String? @unique @map("buscarmisas_network_id") +@@index([buscarmisasNetworkId]) +``` + +After merging the migration in BethelGuide, copy the updated `schema.prisma` to ScraperControl and run `npx prisma generate`. + +The external ID format is `{domain-slug}/{church-slug}`, e.g.: +`horariosmissa-com-br/paroquia-nossa-senhora-dos-remedios` + +where `domain-slug` replaces `.` with `-`, and `church-slug` is the final path segment of the church URL. + +`church-matcher.ts` and the `ExistingChurch` / `ChurchCandidate` interfaces must be updated to include `buscarmisasNetworkId` alongside the existing external ID fields, with a corresponding ID-match pass in `findDuplicateChurch()`. 
+ +--- + ## Architecture ### Config Map @@ -49,17 +69,19 @@ const NETWORK_SITES: Record = { # Single domain npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br -# All domains sequentially -npx tsx scripts/import-buscarmisas-network.ts --all - -# Resume after interruption +# Single domain with resume (--resume-from only valid with --domain) npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --resume-from 1200 +# All domains sequentially (no --resume-from; use --domain for resuming individual runs) +npx tsx scripts/import-buscarmisas-network.ts --all + # Dry run (no DB writes) npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br --dry-run ``` -Source slug stored in DB: domain with dots replaced by dashes, e.g. `horariosmissa-com-br`. +**Validation:** If `--domain` is provided but not present in `NETWORK_SITES`, exit immediately with a clear error message listing valid domains. `--resume-from` combined with `--all` is also an error — exit with usage message. + +Source slug stored in DB `source` field: `buscarmisas-network` (same value for all domains — the `buscarmisasNetworkId` distinguishes per-church). --- @@ -69,9 +91,9 @@ Source slug stored in DB: domain with dots replaced by dashes, e.g. `horariosmis - Fetch `https://{domain}/sitemap_index.xml` - Extract child sitemap URLs -- For `sitemapType: 'page'`: collect all `page-sitemap*.xml` URLs -- For `sitemapType: 'post'`: collect `post-sitemap.xml` -- Fetch each child sitemap, filter to 3-segment church URLs (path depth = `/{region}/{city}/{church-slug}/`) +- For `sitemapType: 'page'`: collect all `page-sitemap*.xml` URLs (ignore `post-sitemap*.xml` and `page-sitemap.xml` city-only entries for Chile) +- For `sitemapType: 'post'`: collect `post-sitemap.xml` only +- Fetch each child sitemap, filter to 3-segment church URLs (path segments = `/{region}/{city}/{church-slug}/`) - Collect deduplicated list of church page URLs ### 2. 
Church Page Parsing @@ -83,49 +105,76 @@ For each church URL, fetch the HTML and extract: | Name | Table cell after `Nome` (PT) or `Nombre` (ES) | | Address | Table cell after `Endereço` (PT) or `Dirección` (ES) | | Phone | `href="tel:..."` anchor | -| Latitude/Longitude | Google Maps iframe `src` — `center={lat}%2C{lng}` parameter | +| Latitude/Longitude | Google Maps iframe `src` — `center={lat}%2C{lng}` parameter (confirmed present on all 5 network sites; same API key `AIzaSyCNTEOso0tZG6YMSJFoaJEY5Th1stEWrJI` used across the network) | | Country | From `SiteConfig.country` | -| State/Region | 1st path segment of church URL | -| City | 2nd path segment of church URL | +| State/Region | 1st path segment of church URL (URL-decoded) | +| City | 2nd path segment of church URL (URL-decoded) | | Mass schedule | Mon–Sun table rows: day name → time string (skip `-` entries) | +| External ID | `{domain-slug}/{church-slug}` | + +If `center=` is absent from the iframe src, skip the church with a warning log. Do not fall back to the `q=` parameter (it contains a search query, not coordinates). ### 3. Schedule Parsing -- Day names resolved via existing `src/scrapers/i18n/day-names.ts` patterns -- Portuguese: `Segunda-feira`, `Terça-feira`, `Quarta-feira`, `Quinta-feira`, `Sexta-feira`, `Sábado`, `Domingo` -- Spanish: `Lunes`, `Martes`, `Miércoles`, `Jueves`, `Viernes`, `Sábado`, `Domingo` -- Times are comma-separated within a single cell (e.g. `10:00, 18:00`) — split and create one schedule entry per time +- Use `getDayNamesForCountry(config.country)` from `src/scrapers/i18n/day-names.ts` to get the day-name map keyed by country code (`'BR'`, `'MX'`, etc.) +- Build patterns with `buildDayPatterns(dayNames)` and match against the table's day-name cell text +- Times are comma-separated within a single cell (e.g. `10:00, 18:00`) — split on `,` and create one schedule entry per time - `-` entries indicate no mass that day — skip ### 4. 
Upsert to Database -- Use `findDuplicateChurch` (existing church-matcher) to check for duplicates before insert -- Upsert church with `source = {domain-slug}` +- Load all existing `buscarmisasNetworkId` values from DB into a `Set` at startup — skip already-imported churches (same pattern as `import-discovermass.ts`) +- Use `findDuplicateChurch()` for new churches to detect cross-source duplicates +- Upsert church with `source = 'buscarmisas-network'` and `buscarmisasNetworkId = externalId` - Upsert mass schedules linked to church -- Track progress: log every 100 churches, support `--resume-from {index}` +- Log progress every 100 churches; support `--resume-from {index}` (single-domain mode only) ### 5. Rate Limiting -- 2-second delay between church page fetches (same as `import-discovermass.ts`) +- 2-second delay between church page fetches - 5-second delay between domains when running `--all` -- Respect HTTP 429 / 503 with exponential backoff (up to 3 retries) +- On HTTP 429 or 503: exponential backoff, up to 3 retries, then skip with warning --- ## Error Handling -- Skip churches where lat/lng cannot be extracted (log warning, continue) -- Skip churches where name is empty -- On fetch error: log and continue to next URL (don't abort the run) +- Skip churches where `center=` lat/lng is absent (log warning, continue) +- Skip churches where name is empty after parsing +- On fetch error: log and continue to next URL - On DB error: log and continue --- ## Integration -- Add `import:buscarmisas-network` script to `package.json` -- Add to scheduler pipeline alongside other importers -- No new dependencies required +### package.json + +```json +"import:buscarmisas-network": "tsx scripts/import-buscarmisas-network.ts" +``` + +### Scheduler (`scripts/scheduler.ts`) + +Add one `PipelinePhase` per domain (5 total) so each country can be scheduled and monitored independently. 
Each phase's `type` string must match exactly between `PIPELINE_GROUPS` and the `case` label in `getJobCommand()` — a mismatch only surfaces at run time as an "Unknown job type" error. The `type` field in the DB `BackgroundJob` model is a plain `String`, consistent with existing values like `'discovermass-import'`. + +All 5 phases and their corresponding `case` blocks: + +| Phase `type` | `--domain` | +|---|---| +| `buscarmisas-network-BR` | `horariosmissa.com.br` | +| `buscarmisas-network-MX` | `buscarmisas.com.mx` | +| `buscarmisas-network-AR` | `horariosmisa.com.ar` | +| `buscarmisas-network-CO` | `buscarmisas.co` | +| `buscarmisas-network-CL` | `horariomisa.cl` | + +```ts +case 'buscarmisas-network-BR': + return `npx tsx scripts/import-buscarmisas-network.ts --domain horariosmissa.com.br`; +case 'buscarmisas-network-MX': + return `npx tsx scripts/import-buscarmisas-network.ts --domain buscarmisas.com.mx`; +// ... etc for AR, CO, CL +``` ---