From bbef80a782b27cafd4b00c7e944c6baac31a7887 Mon Sep 17 00:00:00 2001 From: albertfj114 Date: Tue, 10 Mar 2026 21:49:51 -0400 Subject: [PATCH] docs: add discovermass.com importer spec and implementation plan 20,284 US churches with mass/confession/adoration schedules. 10s crawl delay (robots.txt), Docker deployment via scheduler. Co-Authored-By: Claude Sonnet 4.6 --- .../plans/2026-03-10-discovermass-importer.md | 1222 +++++++++++++++++ .../specs/2026-03-10-discovermass-design.md | 233 ++++ 2 files changed, 1455 insertions(+) create mode 100644 docs/superpowers/plans/2026-03-10-discovermass-importer.md create mode 100644 docs/superpowers/specs/2026-03-10-discovermass-design.md diff --git a/docs/superpowers/plans/2026-03-10-discovermass-importer.md b/docs/superpowers/plans/2026-03-10-discovermass-importer.md new file mode 100644 index 0000000..2306473 --- /dev/null +++ b/docs/superpowers/plans/2026-03-10-discovermass-importer.md @@ -0,0 +1,1222 @@ +# DiscoverMass.com Importer Implementation Plan + +> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Import 20,284 US Catholic churches with mass/confession/adoration schedules from discovermass.com into the NearestMass database. + +**Architecture:** Enumerate 11 WordPress sitemaps → fetch each church page at 10s intervals (respecting Crawl-delay) → parse server-rendered HTML for name/address/coordinates/schedules → match against existing US churches via church-matcher → upsert with full schedule data. + +**Tech Stack:** TypeScript/tsx, Prisma 7 + PrismaPg adapter, pg Pool, Node.js `fetch`, regex HTML parsing (no DOM library needed — HTML is server-rendered and predictable). 
+ +--- + +## Chunk 1: Schema + church-matcher + +### Task 1: Add discovermassId to schema + +**Files:** +- Modify: `prisma/schema.prisma` + +The schema lives in this repo but migrations run in BethelGuide. After editing schema.prisma here, run `npx prisma generate` to regenerate the Prisma client. Do NOT run `prisma migrate`. + +- [ ] **Step 1: Find the right place in schema.prisma** + +Open `prisma/schema.prisma`. Find the block of source ID fields — they look like: +```prisma +gottesdienstzeitenId String? @unique @map("gottesdienstzeiten_id") +``` +This is inside the `model Church { ... }` block, after `kerknetId` and before `claimed`. + +- [ ] **Step 2: Add discovermassId field** + +After `gottesdienstzeitenId`: +```prisma +discovermassId String? @unique @map("discovermass_id") +``` + +Also find the `@@index` block near the bottom of the Church model (it groups all the index definitions). Add: +```prisma +@@index([discovermassId]) +``` + +- [ ] **Step 3: Regenerate Prisma client** + +```bash +cd /home/albert/Documents/ScraperControl +npx prisma generate +``` + +Expected output: `✔ Generated Prisma Client` (no errors). This does NOT touch the database — it only updates the TypeScript client. + +- [ ] **Step 4: Apply migration to database** + +The schema source of truth is BethelGuide. Run the migration there, then sync back. 
Since we're on the same dev server: + +```bash +# Check if discovermass_id column already exists (it shouldn't yet) +psql postgresql://postgres:postgres@192.168.0.145:5434/nearestmass -c "\d churches" | grep discovermass +``` + +If the column doesn't exist, apply it directly: +```bash +psql postgresql://postgres:postgres@192.168.0.145:5434/nearestmass -c " +ALTER TABLE churches ADD COLUMN IF NOT EXISTS discovermass_id VARCHAR UNIQUE; +CREATE INDEX IF NOT EXISTS churches_discovermass_id_idx ON churches(discovermass_id); +" +``` + +Expected output: `ALTER TABLE` and `CREATE INDEX` + +- [ ] **Step 5: Verify column exists** + +```bash +psql postgresql://postgres:postgres@192.168.0.145:5434/nearestmass -c "\d churches" | grep discovermass +``` + +Expected output: `discovermass_id | character varying | ...` + +- [ ] **Step 6: Commit** + +```bash +cd /home/albert/Documents/ScraperControl +git add prisma/schema.prisma +git commit -m "feat: add discovermassId field to Church schema" +``` + +--- + +### Task 2: Update church-matcher + +**Files:** +- Modify: `src/lib/church-matcher.ts` + +The `ExistingChurch` interface (line ~11) lists all source IDs. The `ChurchCandidate` type (line ~122) lists optional source IDs for the candidate. The `findDuplicateChurch` function has sequential passes checking each ID before falling back to proximity+name. + +- [ ] **Step 1: Add discovermassId to ExistingChurch interface** + +Find the `export interface ExistingChurch {` block. After the `gottesdienstzeitenId` line, add: +```typescript +discovermassId: string | null; +``` + +- [ ] **Step 2: Add discovermassId to ChurchCandidate type** + +Find `export type ChurchCandidate = {`. After `gottesdienstzeitenId?: string;`, add: +```typescript +discovermassId?: string; +``` + +- [ ] **Step 3: Add discovermassId matching pass in findDuplicateChurch** + +Find the `findDuplicateChurch` function. 
It has a series of passes like: +```typescript +if (candidate.gottesdienstzeitenId) { + const match = existingChurches.find(c => c.gottesdienstzeitenId === candidate.gottesdienstzeitenId); + if (match) return match; +} +// Proximity + name similarity +``` + +Add a new pass BEFORE the proximity+name pass (after gottesdienstzeitenId): +```typescript +if (candidate.discovermassId) { + const match = existingChurches.find(c => c.discovermassId === candidate.discovermassId); + if (match) return match; +} +``` + +- [ ] **Step 4: Update all callers that construct ExistingChurch objects** + +Search for places that build ExistingChurch objects (the in-memory push after creating a new church). Each importer has a block like: +```typescript +existingChurches.push({ + id: newChurch.id, + ... + gottesdienstzeitenId: null, + ... +}); +``` + +Run: +```bash +grep -rn "gottesdienstzeitenId: null" scripts/ +``` + +For each file found: add `discovermassId: null,` after `gottesdienstzeitenId: null,`. These are the in-memory dedup arrays — they need the new field or TypeScript will complain. + +Also update the `loadExistingChurches` select queries if any importer has one (check with `grep -rn "gottesdienstzeitenId: true" scripts/`). + +- [ ] **Step 5: Verify TypeScript compiles** + +```bash +cd /home/albert/Documents/ScraperControl +npx tsc --noEmit +``` + +Expected: no errors. Fix any type errors (they'll be missing `discovermassId` fields). 
+ +- [ ] **Step 6: Commit** + +```bash +# Stage church-matcher AND all importer scripts that were updated in Step 4 +git add src/lib/church-matcher.ts +git add scripts/ +git commit -m "feat: add discovermassId to church-matcher ExistingChurch and ChurchCandidate" +``` + +--- + +## Chunk 2: import-discovermass.ts — utilities and parsing + +### Task 3: Create file skeleton + utilities + +**Files:** +- Create: `scripts/import-discovermass.ts` + +- [ ] **Step 1: Create the file with header, imports, constants, types** + +Create `scripts/import-discovermass.ts` with this content: + +```typescript +#!/usr/bin/env tsx +/** + * Import Catholic churches and mass schedules from discovermass.com (USA) + * + * discovermass.com is a US Catholic church directory with 20,284 churches. + * Data includes name, address, phone, website, coordinates, mass times, + * confessions, and adoration schedules. + * + * robots.txt specifies Crawl-delay: 10 — this importer follows that rule. + * + * Usage: + * npx tsx scripts/import-discovermass.ts --all + * npx tsx scripts/import-discovermass.ts --all --dry-run + * npx tsx scripts/import-discovermass.ts --all --resume-from 5000 + * npx tsx scripts/import-discovermass.ts --all --job-id {uuid} + */ + +import dotenv from 'dotenv'; +import path from 'path'; + +dotenv.config({ path: path.resolve(process.cwd(), '.env.local') }); +dotenv.config({ path: path.resolve(process.cwd(), '.env') }); + +import { Pool } from 'pg'; +import { PrismaPg } from '@prisma/adapter-pg'; +import { PrismaClient } from '@prisma/client'; + +const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass'; +console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`); +const pool = new Pool({ + connectionString: dbUrl, + ssl: dbUrl.includes('neon') ? 
{ rejectUnauthorized: false } : undefined, +}); +const adapter = new PrismaPg(pool); +const prisma = new PrismaClient({ adapter }); + +import { findDuplicateChurch } from '../src/lib/church-matcher'; +import type { ExistingChurch } from '../src/lib/church-matcher'; + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const SITE_BASE = 'https://discovermass.com'; +const SITEMAP_COUNT = 11; +const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)'; +const REQUEST_DELAY_MS = 10_000; // Crawl-delay: 10 from robots.txt + +// ─── Types ─────────────────────────────────────────────────────────────────── + +interface ParsedChurch { + name: string; + address: string | null; + city: string | null; + state: string | null; + zip: string | null; + phone: string | null; + website: string | null; + lat: number; + lng: number; +} + +interface ParsedMass { + dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat + time: string; // HH:MM 24-hour + language: string; + notes?: string; +} + +interface ParsedConf { + dayOfWeek: number; + startTime: string; // HH:MM 24-hour + endTime: string; // HH:MM 24-hour + notes?: string; +} + +interface ParsedAdoration { + dayOfWeek: number; + startTime: string; // HH:MM 24-hour + endTime: string; // HH:MM 24-hour + notes?: string; +} + +interface ImportStats { + total: number; + created: number; + updated: number; + skipped: number; + errors: number; + massSchedulesCreated: number; + confessionSchedulesCreated: number; + adorationSchedulesCreated: number; +} + +interface CLIArgs { + all: boolean; + dryRun: boolean; + resumeFrom?: number; + jobId?: string; +} +``` + +- [ ] **Step 2: Add day mappings and time utilities** + +Append to the file: + +```typescript +// ─── Day Mappings ───────────────────────────────────────────────────────────── + +// Full day names used in mass schedule
<li> labels +const FULL_DAY_NAMES: Record<string, number> = { + Sunday: 0, Monday: 1, Tuesday: 2, Wednesday: 3, + Thursday: 4, Friday: 5, Saturday: 6, +}; + +// Abbreviated day prefixes used in confession/adoration serviceTime text +const ABBREV_DAY_NAMES: Record<string, number[]> = { + Sun: [0], Mon: [1], Tue: [2], Wed: [3], + Thr: [4], Thu: [4], Fri: [5], Sat: [6], + Weekdays: [1, 2, 3, 4, 5], + Daily: [0, 1, 2, 3, 4, 5, 6], +}; + +// ─── Time Utilities ─────────────────────────────────────────────────────────── + +/** + * Convert "5:00pm", "11:00am", "12:00pm", "12:00am" to "HH:MM" 24-hour format. + * Returns the trimmed, lower-cased input if it doesn't match the expected format. + */ +function convertTo24h(timeStr: string): string { + const cleaned = timeStr.trim().toLowerCase(); + const m = cleaned.match(/^(\d{1,2}):(\d{2})(am|pm)$/); + if (!m) return cleaned; + let hours = parseInt(m[1], 10); + const mins = m[2]; + const meridiem = m[3]; + if (meridiem === 'pm' && hours !== 12) hours += 12; + if (meridiem === 'am' && hours === 12) hours = 0; + return `${String(hours).padStart(2, '0')}:${mins}`; +} + +/** + * Parse "8:30am-9:00am" → ["08:30", "09:00"]. + * Each side must carry its own am/pm suffix; no meridiem is inferred from the other side. + * E.g. "8:30am-9:00am" → both explicit. "9:00am-6:00pm" → both explicit. + * A string with no '-' separator yields the same converted time for start and end. + */ +function parseTimeRange(rangeStr: string): [string, string] { + // Split on the first '-' that follows the first ':' — the separator between start and end + // Pattern: "8:30am-9:00am" or "3:30pm-4:30pm" + const hyphenIdx = rangeStr.indexOf('-', rangeStr.indexOf(':') + 1); + if (hyphenIdx === -1) { + const t = convertTo24h(rangeStr.trim()); + return [t, t]; + } + const start = convertTo24h(rangeStr.slice(0, hyphenIdx).trim()); + const end = convertTo24h(rangeStr.slice(hyphenIdx + 1).trim()); + return [start, end]; +} + +/** + * Expand abbreviated day prefix to array of dayOfWeek integers. + * Returns empty array if prefix is not recognized.
+ */ +function expandDayAbbrev(prefix: string): number[] { + return ABBREV_DAY_NAMES[prefix] ?? []; +} + +// ─── Address Parsing ────────────────────────────────────────────────────────── + +/** + * Parse "14085 Peyton Drive, Chino Hills, CA 91709" into components. + * Returns partial result on malformed input. + */ +function parseAddress(raw: string): { address: string | null; city: string | null; state: string | null; zip: string | null } { + const parts = raw.split(', '); + if (parts.length < 3) return { address: raw, city: null, state: null, zip: null }; + const last = parts[parts.length - 1].trim(); + const stateZipMatch = last.match(/^([A-Z]{2})\s+(\d{5}(?:-\d{4})?)$/); + if (!stateZipMatch) return { address: raw, city: null, state: null, zip: null }; + return { + address: parts.slice(0, parts.length - 2).join(', ').trim(), + city: parts[parts.length - 2].trim(), + state: stateZipMatch[1], + zip: stateZipMatch[2], + }; +} +``` + +- [ ] **Step 3: Verify utilities compile** + +```bash +cd /home/albert/Documents/ScraperControl +npx tsc --noEmit +``` + +Expected: no errors related to import-discovermass.ts. Other files may have pre-existing errors — focus only on this file's errors. + +--- + +### Task 4: Add HTML parsing functions + +**Files:** +- Modify: `scripts/import-discovermass.ts` + +The HTML is server-rendered. The page structure has: +- `` for church name +- US address embedded as text in a known pattern +- `