Compare commits
8 Commits
92265cf27f
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
027ca59a01 | ||
|
|
9d0af3289a | ||
|
|
6d1c7eb3c5 | ||
| 206b64b9b8 | |||
|
|
4609fd97db | ||
|
|
2c51513851 | ||
|
|
76cca3ba75 | ||
|
|
3cf1465fb6 |
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
node_modules/
|
||||
.next/
|
||||
.env
|
||||
.env.*
|
||||
.claude/
|
||||
.worktrees/
|
||||
254
CLAUDE.md
Normal file
254
CLAUDE.md
Normal file
@@ -0,0 +1,254 @@
|
||||
# Role in Ecosystem
|
||||
|
||||
**ScraperControl** is the data pipeline for the Church project — handling scraping, enrichment, ChromaDB semantic search, and data transfer to Neon production. It runs on the Synology NAS (Docker), not Vercel.
|
||||
|
||||
- **Schema sync**: Handled by `npm run sync` from the `Church/` root directory. No need to manually copy schema files.
|
||||
- **Coordinated deployment**: Use `npm run deploy` from `Church/` root for full pipeline deployment.
|
||||
- **Schema source of truth**: BethelGuide — never run `prisma migrate` in ScraperControl.
|
||||
|
||||
---
|
||||
|
||||
# Claude Instructions for ScraperControl
|
||||
|
||||
## Project Overview
|
||||
|
||||
**ScraperControl** is the scraping, enrichment, and data management backend for the NearestMass church finder. It provides:
|
||||
|
||||
1. **Admin Dashboard** (Next.js): Job management UI at port 3001
|
||||
2. **Web Scrapers**: Playwright-based scrapers for extracting mass schedules from church websites
|
||||
3. **Enrichment Pipelines**: Google Places, FreeSearch, reverse geocode enrichment
|
||||
4. **ChromaDB Integration**: Semantic search for deduplication, content classification, and change detection
|
||||
5. **Scheduler**: Database-driven job queue for automated scraping
|
||||
|
||||
### Shared Database Architecture
|
||||
|
||||
ScraperControl and BethelGuide share the **same NAS PostgreSQL database** (192.168.0.145:5434). BethelGuide is the **schema source of truth**. After any schema change in BethelGuide:
|
||||
|
||||
1. Sync `BethelGuide/prisma/schema.prisma` → `ScraperControl/prisma/schema.prisma` by running `npm run sync` from the `Church/` root (no manual copy needed)
|
||||
2. Run `npx prisma generate` in ScraperControl (NOT `migrate`)
|
||||
3. Rebuild Docker containers if needed
|
||||
|
||||
---
|
||||
|
||||
## Tech Stack
|
||||
|
||||
| Layer | Technology |
|
||||
|-------|------------|
|
||||
| Admin UI | Next.js 16, React 19, Tailwind CSS v4 |
|
||||
| Database | Shared NAS PostgreSQL (192.168.0.145:5434) |
|
||||
| ORM | Prisma 7 (`@prisma/adapter-pg` + `pg` Pool) |
|
||||
| Web Scraping | Playwright (headless Chromium) |
|
||||
| Vector DB | ChromaDB (192.168.0.145:8000) |
|
||||
| Embeddings | Ollama on MacBook (192.168.0.75:11434) with nomic-embed-text |
|
||||
| Scheduling | node-cron + database-driven job queue |
|
||||
| Containerization | Docker, Docker Compose |
|
||||
|
||||
---
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
src/
|
||||
├── app/ # Next.js Admin Dashboard (port 3001)
|
||||
│ ├── page.tsx # Main dashboard (Jobs, Scrapes, Search tabs)
|
||||
│ └── api/admin/ # Admin API routes
|
||||
│ ├── jobs/ # Job management (GET/POST/PATCH)
|
||||
│ ├── scrape-log/ # Recently scraped churches log
|
||||
│ └── freesearch-log/ # FreeSearch results log
|
||||
│
|
||||
├── chromadb/ # ChromaDB integration
|
||||
│ ├── client.ts # ChromaDB client singleton
|
||||
│ ├── embeddings.ts # OpenAI-compatible embedding helper (Ollama)
|
||||
│ ├── collections.ts # Collection definitions (5 collections)
|
||||
│ └── queries.ts # Query helpers per use case
|
||||
│
|
||||
├── lib/ # Core business logic
|
||||
│ ├── db.ts # Prisma client singleton
|
||||
│ ├── admin-auth.ts # Timing-safe API key auth
|
||||
│ ├── geo.ts # Haversine distance (minimal)
|
||||
│ ├── scraper-service.ts # Scraper orchestration
|
||||
│ ├── overpass-client.ts # OpenStreetMap Overpass API
|
||||
│ ├── church-matcher.ts # Church matching/dedup
|
||||
│ └── masstimes-scraper.ts # MassTimes.org integration
|
||||
│
|
||||
└── scrapers/ # Web scraping system
|
||||
├── base-scraper.ts # Base class
|
||||
├── index.ts # Exports
|
||||
├── registry.ts # Strategy registry
|
||||
├── url-discovery.ts # Mass schedule URL finder
|
||||
├── strategies/ # Language-specific scrapers
|
||||
│ ├── generic.ts # Fallback (10+ languages)
|
||||
│ ├── english.ts
|
||||
│ ├── french.ts
|
||||
│ ├── german.ts
|
||||
│ ├── italian.ts
|
||||
│ └── spanish.ts
|
||||
└── i18n/ # Internationalization
|
||||
├── day-names.ts # Day name patterns per language
|
||||
└── day-ranges.ts # Day range parsing ("Monday-Friday")
|
||||
|
||||
scripts/ # CLI scripts
|
||||
├── scrape-churches.ts # Scrape churches by language
|
||||
├── scrape-masstimes.ts # Scrape from MassTimes.org
|
||||
├── import-osm-churches.ts # Import from OpenStreetMap
|
||||
├── import-osm-region.ts # Import specific OSM region
|
||||
├── enrich-with-google-places.ts # Google Places enrichment
|
||||
├── enrich-with-freesearch.ts # FreeSearch website enrichment
|
||||
├── enrich-with-reverse-geocode.ts # Reverse geocode enrichment
|
||||
├── scheduler.ts # Background job scheduler
|
||||
├── dedup-mass-schedules.ts # Mass schedule deduplication
|
||||
├── dedup-churches.ts # Church dedup via ChromaDB
|
||||
├── transfer-enriched-to-neon.ts # NAS → Neon production sync
|
||||
├── populate-chromadb.ts # Bulk-populate ChromaDB collections
|
||||
├── populate-city-normalized.ts
|
||||
├── save-schedules-to-db.ts
|
||||
├── test-scraper.ts # Test scraper on a URL
|
||||
├── test-url-discovery.ts # Test URL discovery
|
||||
├── test-edge-cases.ts # International edge case tests
|
||||
└── debug/ # Debug/investigation scripts (~44 files)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Common Commands
|
||||
|
||||
```bash
|
||||
# === DEVELOPMENT ===
|
||||
npm run dev # Start admin dashboard (localhost:3001)
|
||||
npm run build # Build Next.js app
|
||||
|
||||
# === SCRAPING ===
|
||||
npm run scrape:churches # Scrape churches (pass --language, --all flags)
|
||||
npm run scrape:masstimes # Scrape from MassTimes.org
|
||||
npm run test:scraper # Test scraper on a URL
|
||||
npm run test:discover # Test URL discovery
|
||||
|
||||
# === ENRICHMENT ===
|
||||
npm run enrich:places # Google Places enrichment
|
||||
npm run enrich:freesearch # FreeSearch website enrichment
|
||||
|
||||
# === DATA MANAGEMENT ===
|
||||
npm run dedup:masses # Deduplicate mass schedules
|
||||
npm run import:osm # Import churches from OpenStreetMap
|
||||
npm run transfer:neon # Transfer enriched data to Neon production
|
||||
npm run scheduler # Start background job scheduler
|
||||
|
||||
# === CHROMADB ===
|
||||
npx tsx scripts/populate-chromadb.ts --all # Populate all collections
|
||||
npx tsx scripts/populate-chromadb.ts --collection church_identity # Single collection
|
||||
npx tsx scripts/dedup-churches.ts --threshold 0.15 # Find duplicate churches
|
||||
|
||||
# === DOCKER (on NAS) ===
|
||||
docker compose build scraper # Build scraper image
|
||||
docker compose --profile tools run --rm scraper <command> # Run one-off scraper
|
||||
docker compose up -d scheduler freesearch-enrichment # Start background services
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ChromaDB Integration
|
||||
|
||||
### Collections
|
||||
|
||||
| Collection | Purpose | Documents |
|
||||
|---|---|---|
|
||||
| `church_identity` | Deduplication | `{name} {address} {city} {country}` |
|
||||
| `search_results` | FreeSearch matching | `{title} {snippet} {url}` |
|
||||
| `page_classification` | Content classification | Page text (first 2000 chars) |
|
||||
| `schedule_sections` | Schedule detection | Text blocks with mass times |
|
||||
| `page_snapshots` | Change detection | Full page text |
|
||||
|
||||
### Infrastructure
|
||||
|
||||
- **ChromaDB server**: `http://192.168.0.145:8000` (on NAS)
|
||||
- **Embedding API**: `http://192.168.0.75:11434/v1` (Ollama on MacBook M1)
|
||||
- **Embedding model**: `nomic-embed-text` (~270MB, fast on M1)
|
||||
|
||||
### Prerequisite
|
||||
|
||||
Ollama must be running on the MacBook with LAN access enabled:
|
||||
```bash
|
||||
OLLAMA_HOST=0.0.0.0 ollama serve
|
||||
ollama pull nomic-embed-text
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Docker Services
|
||||
|
||||
| Service | Profile | Purpose |
|
||||
|---|---|---|
|
||||
| app | (default) | Admin dashboard on port 3001 |
|
||||
| scraper | tools | Generic scraper (on-demand) |
|
||||
| scraper-english | scraper-english | English language scraper |
|
||||
| scraper-french | scraper-french | French language scraper |
|
||||
| scraper-german | scraper-german | German language scraper |
|
||||
| scraper-italian | scraper-italian | Italian language scraper |
|
||||
| scraper-spanish | scraper-spanish | Spanish language scraper |
|
||||
| scraper-generic | scraper-generic | Generic fallback scraper |
|
||||
| scheduler | (default) | Background job scheduler |
|
||||
| freesearch-enrichment | (default) | FreeSearch enrichment daemon |
|
||||
|
||||
---
|
||||
|
||||
## Environment Variables
|
||||
|
||||
```env
|
||||
DATABASE_URL="postgresql://postgres:postgres@192.168.0.145:5434/nearestmass"
|
||||
ADMIN_API_KEY=your-secret-key
|
||||
CHROMADB_URL=http://192.168.0.145:8000
|
||||
EMBEDDING_API_URL=http://192.168.0.75:11434/v1
|
||||
EMBEDDING_MODEL=nomic-embed-text
|
||||
GOOGLE_PLACES_API_KEY=your-google-key
|
||||
FREESEARCH_URL=http://192.168.0.145:3111
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## NAS Deployment
|
||||
|
||||
ScraperControl is deployed on the Synology NAS at `/volume1/docker/scraper-control/`.
|
||||
|
||||
### Container Layout
|
||||
|
||||
| Container | Purpose | Port |
|
||||
|-----------|---------|------|
|
||||
| scraper-control-app-1 | Admin dashboard | 3001 |
|
||||
| scraper-control-scheduler-1 | Job scheduler | - |
|
||||
| scraper-control-freesearch-enrichment-1 | FreeSearch daemon | - |
|
||||
|
||||
The `db` container (`nearestmass-db-1`) is managed by BethelGuide's compose file at `/volume1/docker/nearestmass/`. ScraperControl joins the same `nearestmass_default` external Docker network — no `depends_on` allowed since `db` is in a different compose file.
|
||||
|
||||
### Deploying Updates
|
||||
|
||||
```bash
|
||||
# From local machine:
|
||||
bash scripts/deploy-to-nas.sh
|
||||
|
||||
# Or manually:
|
||||
rsync -avz --exclude 'node_modules' --exclude '.next' --exclude '.git' --exclude '.env.local' --exclude '*.log' \
|
||||
/Users/albert/Documents/Projects/Church/ScraperControl/ albert@192.168.0.145:/volume1/docker/scraper-control/
|
||||
|
||||
ssh albert@192.168.0.145 'cd /volume1/docker/scraper-control && /usr/local/bin/docker compose build scraper && /usr/local/bin/docker compose up -d scheduler freesearch-enrichment'
|
||||
```
|
||||
|
||||
### Rebuilding Admin Dashboard
|
||||
|
||||
```bash
|
||||
ssh albert@192.168.0.145 'cd /volume1/docker/scraper-control && /usr/local/bin/docker compose build app && /usr/local/bin/docker compose up -d app'
|
||||
```
|
||||
|
||||
### Important Notes
|
||||
|
||||
- **DO NOT** add `depends_on: db` to any service — `db` is in BethelGuide's compose file
|
||||
- The `.env` on NAS uses host IP (`192.168.0.145:5434`) for scripts run outside Docker
|
||||
- The `docker-compose.yml` environment overrides use `db:5432` (Docker DNS via shared network)
|
||||
- Docker binary on NAS is at `/usr/local/bin/docker`
|
||||
|
||||
### NAS Docker Health
|
||||
|
||||
The Synology NAS (4 CPU, 17GB RAM) runs 23 containers across 7 projects. Church project containers (5) all have memory limits and log rotation. See `memory/nas-docker-health.md` for full inventory.
|
||||
|
||||
**Scheduler hardening**: Uses `detached: true` + process group kill to prevent orphaned Chromium processes, `init: true` for zombie reaping, 24h job timeout, 8GB memory limit.
|
||||
|
||||
**Maintenance**: Docker is on /volume1 (15TB free). Run `docker builder prune -f` occasionally to keep build cache tidy.
|
||||
30
Dockerfile
Normal file
30
Dockerfile
Normal file
@@ -0,0 +1,30 @@
|
||||
FROM node:20-alpine AS deps
|
||||
WORKDIR /app
|
||||
COPY package.json package-lock.json* ./
|
||||
COPY prisma ./prisma/
|
||||
RUN npm ci && npx prisma generate
|
||||
|
||||
FROM node:20-alpine AS builder
|
||||
WORKDIR /app
|
||||
COPY --from=deps /app/node_modules ./node_modules
|
||||
COPY . .
|
||||
ENV NEXT_TELEMETRY_DISABLED=1
|
||||
RUN npm run build
|
||||
|
||||
FROM node:20-alpine AS runner
|
||||
WORKDIR /app
|
||||
ENV NODE_ENV=production
|
||||
ENV NEXT_TELEMETRY_DISABLED=1
|
||||
ENV PORT=3001
|
||||
ENV HOSTNAME="0.0.0.0"
|
||||
|
||||
RUN addgroup --system --gid 1001 nodejs && \
|
||||
adduser --system --uid 1001 nextjs
|
||||
|
||||
COPY --from=builder /app/public ./public
|
||||
COPY --from=builder --chown=nextjs:nodejs /app/.next/standalone ./
|
||||
COPY --from=builder --chown=nextjs:nodejs /app/.next/static ./.next/static
|
||||
|
||||
USER nextjs
|
||||
EXPOSE 3001
|
||||
CMD ["node", "server.js"]
|
||||
21
Dockerfile.scraper
Normal file
21
Dockerfile.scraper
Normal file
@@ -0,0 +1,21 @@
|
||||
FROM node:20-bookworm-slim
|
||||
|
||||
# Install Playwright system dependencies + Chromium
|
||||
RUN apt-get update && \
|
||||
npx playwright install --with-deps chromium && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY package.json package-lock.json ./
|
||||
COPY prisma ./prisma/
|
||||
RUN npm ci
|
||||
RUN npx prisma generate
|
||||
|
||||
COPY src ./src/
|
||||
COPY scripts ./scripts/
|
||||
COPY tsconfig.json ./
|
||||
|
||||
# Default: run the masstimes scraper
|
||||
CMD ["npx", "tsx", "scripts/scrape-masstimes.ts"]
|
||||
43
docs/plans/2026-02-25-parallel-scrapers-design.md
Normal file
43
docs/plans/2026-02-25-parallel-scrapers-design.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# Parallel Scrapers with Country Mapping Fix
|
||||
|
||||
## Problem
|
||||
|
||||
The scheduler runs scrapers sequentially — one language at a time. With 19,996 unscraped churches queued across 10 language scrapers, a full cycle takes days. The English scraper alone runs 30+ hours. Additionally, 1,414 churches in unmapped countries (BE, CH, IN, etc.) fall through to the generic scraper instead of being handled by appropriate language scrapers.
|
||||
|
||||
## Changes
|
||||
|
||||
### 1. Country Mapping Additions (scraper-service.ts)
|
||||
|
||||
Add to `COUNTRY_SCRAPER_MAP`:
|
||||
- **English**: IN, SG, MY, KE, JM, TT, GH, NG, ZA, TZ, UG
|
||||
- **French**: BE, LU
|
||||
- **German**: CH, SI
|
||||
- **Italian**: HR, RO
|
||||
|
||||
### 2. Parallel Pipeline Groups (scheduler.ts)
|
||||
|
||||
Replace sequential `PIPELINE_PHASES` array with grouped phases:
|
||||
|
||||
| Group | Phases | Concurrency |
|
||||
|-------|--------|-------------|
|
||||
| 1 | osm-import, gcatholic-import | Sequential (shared data) |
|
||||
| 2 | english, french, german | Parallel (3) |
|
||||
| 3 | polish, spanish, italian | Parallel (3) |
|
||||
| 4 | portuguese, czech, dutch | Parallel (3) |
|
||||
| 5 | hungarian, generic | Parallel (2) |
|
||||
|
||||
Scheduler starts all jobs in a group simultaneously, waits for all to finish, then advances to the next group.
|
||||
|
||||
### 3. Generic Scraper Deprioritized
|
||||
|
||||
- Moved to last group
|
||||
- Pre-check query: skip if no unscraped churches in generic queue (avoids wasteful re-scrapes)
|
||||
|
||||
### 4. Resource Changes
|
||||
|
||||
- Scheduler container memory limit: 4GB → 10GB (3 concurrent Playwright/Chromium processes)
|
||||
- No new Docker containers or compose changes needed — existing child process spawning approach is kept
|
||||
|
||||
## Approach
|
||||
|
||||
Approach B: parallel child processes inside the scheduler container. No Docker-in-Docker. The scheduler already spawns `npx tsx` processes — we just allow multiple to run concurrently instead of waiting for each to finish before starting the next.
|
||||
423
docs/plans/2026-02-25-parallel-scrapers.md
Normal file
423
docs/plans/2026-02-25-parallel-scrapers.md
Normal file
@@ -0,0 +1,423 @@
|
||||
# Parallel Scrapers Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||
|
||||
**Goal:** Run language scrapers in parallel groups of up to 3, add missing country mappings, and deprioritize the generic scraper.
|
||||
|
||||
**Architecture:** Replace sequential pipeline phases with grouped phases. Groups run their jobs concurrently (max 3), then wait for all to complete before advancing. Import phases stay sequential. The scheduler tracks an `activeGroupJobs` counter for the current group instead of advancing on every job completion.
|
||||
|
||||
**Tech Stack:** TypeScript, node child_process spawn, Prisma, Docker Compose
|
||||
|
||||
---
|
||||
|
||||
### Task 1: Add Missing Country Mappings
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/lib/scraper-service.ts:29-45`
|
||||
|
||||
**Step 1: Update COUNTRY_SCRAPER_MAP**
|
||||
|
||||
Add these entries to the existing `COUNTRY_SCRAPER_MAP` object at `src/lib/scraper-service.ts:29`:
|
||||
|
||||
```typescript
|
||||
const COUNTRY_SCRAPER_MAP: Record<string, string> = {
|
||||
US: 'english', CA: 'english', GB: 'english',
|
||||
AU: 'english', NZ: 'english', IE: 'english', PH: 'english',
|
||||
IN: 'english', SG: 'english', MY: 'english', KE: 'english',
|
||||
JM: 'english', TT: 'english', GH: 'english', NG: 'english',
|
||||
ZA: 'english', TZ: 'english', UG: 'english',
|
||||
FR: 'french', BE: 'french', LU: 'french',
|
||||
ES: 'spanish', MX: 'spanish', AR: 'spanish', CO: 'spanish',
|
||||
CL: 'spanish', PE: 'spanish', EC: 'spanish', VE: 'spanish',
|
||||
CR: 'spanish', PA: 'spanish', GT: 'spanish', CU: 'spanish',
|
||||
HN: 'spanish', SV: 'spanish', NI: 'spanish', BO: 'spanish',
|
||||
PY: 'spanish', UY: 'spanish', DO: 'spanish',
|
||||
IT: 'italian', SM: 'italian', VA: 'italian',
|
||||
HR: 'italian', RO: 'italian',
|
||||
DE: 'german', AT: 'german', LI: 'german',
|
||||
CH: 'german', SI: 'german',
|
||||
PL: 'polish',
|
||||
PT: 'portuguese', BR: 'portuguese',
|
||||
NL: 'dutch',
|
||||
CZ: 'czech', SK: 'czech',
|
||||
HU: 'hungarian',
|
||||
};
|
||||
```
|
||||
|
||||
Also update `buildLanguageFilter` at `src/lib/scraper-service.ts:346-463` to include the new countries in each language filter's country list:
|
||||
|
||||
- `english` filter (line 356): add `'IN', 'SG', 'MY', 'KE', 'JM', 'TT', 'GH', 'NG', 'ZA', 'TZ', 'UG'`
|
||||
- `french` filter (line 366): add `'BE', 'LU'` → `{ in: ['FR', 'BE', 'LU'] }`
|
||||
- `spanish` filter: already has all needed countries
|
||||
- `italian` filter (line 387): add `'HR', 'RO'` → `{ in: ['IT', 'SM', 'VA', 'HR', 'RO'] }`
|
||||
- `german` filter (line 397): add `'CH', 'SI'` → `{ in: ['DE', 'AT', 'LI', 'CH', 'SI'] }`
|
||||
|
||||
**Step 2: Verify build**
|
||||
|
||||
Run: `npm run build`
|
||||
Expected: Build succeeds with no errors
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add src/lib/scraper-service.ts
|
||||
git commit -m "feat: add missing country mappings to language scrapers
|
||||
|
||||
Add BE/LU→french, CH/SI→german, HR/RO→italian, IN/SG/MY/KE/JM/TT/GH/NG/ZA/TZ/UG→english.
|
||||
~1,400 previously unmapped churches now routed to proper language scrapers."
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 2: Rewrite Scheduler for Parallel Groups
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/scheduler.ts`
|
||||
|
||||
**Step 1: Replace pipeline data structure**
|
||||
|
||||
Replace the `PipelinePhase` interface and `PIPELINE_PHASES` array (lines 27-49) with the following (the `CycleState` interface at lines 53-69 is replaced separately in Step 2):
|
||||
|
||||
```typescript
|
||||
interface PipelinePhase {
|
||||
name: string;
|
||||
type: string;
|
||||
language?: string;
|
||||
config: Record<string, unknown>;
|
||||
}
|
||||
|
||||
interface PipelineGroup {
|
||||
name: string;
|
||||
phases: PipelinePhase[];
|
||||
mode: 'sequential' | 'parallel';
|
||||
}
|
||||
|
||||
const PIPELINE_GROUPS: PipelineGroup[] = [
|
||||
{
|
||||
name: 'imports',
|
||||
mode: 'sequential',
|
||||
phases: [
|
||||
{ name: 'osm-import-p1', type: 'osm-import', config: { priority: 1 } },
|
||||
{ name: 'gcatholic-import', type: 'gcatholic-import', config: { delay: 2000 } },
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'scrapers-batch-1',
|
||||
mode: 'parallel',
|
||||
phases: [
|
||||
{ name: 'scraper-english', type: 'scraper', language: 'english', config: { allMode: true, maxFailures: 10, language: 'english' } },
|
||||
{ name: 'scraper-french', type: 'scraper', language: 'french', config: { allMode: true, maxFailures: 10, language: 'french' } },
|
||||
{ name: 'scraper-german', type: 'scraper', language: 'german', config: { allMode: true, maxFailures: 10, language: 'german' } },
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'scrapers-batch-2',
|
||||
mode: 'parallel',
|
||||
phases: [
|
||||
{ name: 'scraper-polish', type: 'scraper', language: 'polish', config: { allMode: true, maxFailures: 10, language: 'polish' } },
|
||||
{ name: 'scraper-spanish', type: 'scraper', language: 'spanish', config: { allMode: true, maxFailures: 10, language: 'spanish' } },
|
||||
{ name: 'scraper-italian', type: 'scraper', language: 'italian', config: { allMode: true, maxFailures: 10, language: 'italian' } },
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'scrapers-batch-3',
|
||||
mode: 'parallel',
|
||||
phases: [
|
||||
{ name: 'scraper-portuguese', type: 'scraper', language: 'portuguese', config: { allMode: true, maxFailures: 10, language: 'portuguese' } },
|
||||
{ name: 'scraper-czech', type: 'scraper', language: 'czech', config: { allMode: true, maxFailures: 10, language: 'czech' } },
|
||||
{ name: 'scraper-dutch', type: 'scraper', language: 'dutch', config: { allMode: true, maxFailures: 10, language: 'dutch' } },
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'scrapers-batch-4',
|
||||
mode: 'parallel',
|
||||
phases: [
|
||||
{ name: 'scraper-hungarian', type: 'scraper', language: 'hungarian', config: { allMode: true, maxFailures: 10, language: 'hungarian' } },
|
||||
{ name: 'scraper-generic', type: 'scraper', language: 'generic', config: { allMode: true, maxFailures: 10, language: 'generic' } },
|
||||
],
|
||||
},
|
||||
];
|
||||
```
|
||||
|
||||
**Step 2: Replace CycleState**
|
||||
|
||||
```typescript
|
||||
interface CycleState {
|
||||
currentGroupIndex: number;
|
||||
currentSequentialPhaseIndex: number; // for sequential groups, tracks which phase within the group
|
||||
cycleNumber: number;
|
||||
cycleStartedAt: Date | null;
|
||||
lastCycleCompletedAt: Date | null;
|
||||
waitingForCooldown: boolean;
|
||||
activeGroupJobs: number; // how many jobs still running in the current group
|
||||
}
|
||||
|
||||
const cycleState: CycleState = {
|
||||
currentGroupIndex: 0,
|
||||
currentSequentialPhaseIndex: 0,
|
||||
cycleNumber: 0,
|
||||
cycleStartedAt: null,
|
||||
lastCycleCompletedAt: null,
|
||||
waitingForCooldown: false,
|
||||
activeGroupJobs: 0,
|
||||
};
|
||||
```
|
||||
|
||||
**Step 3: Rewrite pollAndAdvancePipeline**
|
||||
|
||||
Replace the entire `pollAndAdvancePipeline` function (lines 306-385) and `advancePipelinePhase` function (lines 387-390) with:
|
||||
|
||||
```typescript
|
||||
async function pollAndAdvancePipeline(): Promise<void> {
|
||||
try {
|
||||
// 1. Check for manual pending jobs from admin API (priority over pipeline)
|
||||
if (runningJobs.size === 0) {
|
||||
const manualJob = await prisma.backgroundJob.findFirst({
|
||||
where: {
|
||||
status: 'pending',
|
||||
NOT: { config: { path: ['pipelineManaged'], equals: true } },
|
||||
},
|
||||
orderBy: { createdAt: 'asc' },
|
||||
});
|
||||
|
||||
if (manualJob) {
|
||||
log(`Found manual job: ${manualJob.type}${manualJob.language ? `:${manualJob.language}` : ''} (${manualJob.id})`);
|
||||
await startJobProcess(
|
||||
manualJob.id,
|
||||
manualJob.type,
|
||||
manualJob.language,
|
||||
manualJob.config as Record<string, unknown> | null
|
||||
);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// 2. If jobs are still running for the current group, wait
|
||||
if (cycleState.activeGroupJobs > 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// 3. If in cooldown, check if expired
|
||||
if (cycleState.waitingForCooldown) {
|
||||
if (cycleState.lastCycleCompletedAt) {
|
||||
const elapsed = Date.now() - cycleState.lastCycleCompletedAt.getTime();
|
||||
if (elapsed < CYCLE_COOLDOWN_MS) {
|
||||
const remaining = Math.round((CYCLE_COOLDOWN_MS - elapsed) / 60_000);
|
||||
if (remaining % 30 === 0 || remaining <= 5) {
|
||||
log(`Cooldown: ${remaining} minutes remaining before next cycle`);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
cycleState.waitingForCooldown = false;
|
||||
cycleState.currentGroupIndex = 0;
|
||||
cycleState.currentSequentialPhaseIndex = 0;
|
||||
log('Cooldown expired, starting new cycle');
|
||||
}
|
||||
|
||||
// 4. If past the last group, complete the cycle
|
||||
if (cycleState.currentGroupIndex >= PIPELINE_GROUPS.length) {
|
||||
cycleState.cycleNumber++;
|
||||
cycleState.lastCycleCompletedAt = new Date();
|
||||
cycleState.waitingForCooldown = true;
|
||||
const cooldownHours = CYCLE_COOLDOWN_MS / (60 * 60 * 1000);
|
||||
log(`=== Cycle ${cycleState.cycleNumber} complete! Entering ${cooldownHours}h cooldown ===`);
|
||||
return;
|
||||
}
|
||||
|
||||
// 5. Start the current group
|
||||
const group = PIPELINE_GROUPS[cycleState.currentGroupIndex];
|
||||
|
||||
if (cycleState.currentGroupIndex === 0 && cycleState.currentSequentialPhaseIndex === 0 && !cycleState.cycleStartedAt) {
|
||||
cycleState.cycleStartedAt = new Date();
|
||||
log(`=== Starting cycle ${cycleState.cycleNumber + 1} ===`);
|
||||
}
|
||||
|
||||
if (group.mode === 'parallel') {
|
||||
// Launch all phases in the group concurrently
|
||||
log(`Pipeline group ${cycleState.currentGroupIndex + 1}/${PIPELINE_GROUPS.length}: ${group.name} (parallel, ${group.phases.length} jobs)`);
|
||||
cycleState.activeGroupJobs = group.phases.length;
|
||||
|
||||
for (const phase of group.phases) {
|
||||
const jobId = await createPendingJob(
|
||||
phase.type,
|
||||
phase.language,
|
||||
{ ...phase.config, pipelineManaged: true }
|
||||
);
|
||||
await startJobProcess(jobId, phase.type, phase.language || null, phase.config);
|
||||
}
|
||||
} else {
|
||||
// Sequential: run one phase at a time within the group
|
||||
const phaseIndex = cycleState.currentSequentialPhaseIndex;
|
||||
if (phaseIndex >= group.phases.length) {
|
||||
// All phases in this sequential group are done
|
||||
cycleState.currentGroupIndex++;
|
||||
cycleState.currentSequentialPhaseIndex = 0;
|
||||
return; // Will pick up next group on next poll
|
||||
}
|
||||
|
||||
const phase = group.phases[phaseIndex];
|
||||
log(`Pipeline group ${cycleState.currentGroupIndex + 1}/${PIPELINE_GROUPS.length}: ${group.name} (sequential ${phaseIndex + 1}/${group.phases.length}: ${phase.name})`);
|
||||
cycleState.activeGroupJobs = 1;
|
||||
|
||||
const jobId = await createPendingJob(
|
||||
phase.type,
|
||||
phase.language,
|
||||
{ ...phase.config, pipelineManaged: true }
|
||||
);
|
||||
await startJobProcess(jobId, phase.type, phase.language || null, phase.config);
|
||||
}
|
||||
} catch (err) {
|
||||
logError(`Error in pipeline: ${err}`);
|
||||
}
|
||||
}
|
||||
|
||||
function onJobCompleted(): void {
|
||||
cycleState.activeGroupJobs--;
|
||||
|
||||
if (cycleState.activeGroupJobs <= 0) {
|
||||
cycleState.activeGroupJobs = 0;
|
||||
const group = PIPELINE_GROUPS[cycleState.currentGroupIndex];
|
||||
|
||||
if (group?.mode === 'sequential') {
|
||||
cycleState.currentSequentialPhaseIndex++;
|
||||
// Check if there are more phases in this sequential group
|
||||
if (cycleState.currentSequentialPhaseIndex < group.phases.length) {
|
||||
return; // Don't advance group yet
|
||||
}
|
||||
}
|
||||
|
||||
// Advance to next group
|
||||
cycleState.currentGroupIndex++;
|
||||
cycleState.currentSequentialPhaseIndex = 0;
|
||||
log(`Group "${group?.name}" complete, advancing to group ${cycleState.currentGroupIndex + 1}`);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Step 4: Update startJobProcess callbacks**
|
||||
|
||||
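In the `child.on('close')` callback (line 442) and `child.on('error')` callback (line 472), replace `advancePipelinePhase()` with `onJobCompleted()`. NOTE(review): manual jobs are also launched through `startJobProcess`, and `onJobCompleted()` advances the group whenever `activeGroupJobs` drops to 0 or below — confirm these callbacks invoke it only for pipeline-managed jobs, otherwise a finished manual job will skip the current group.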
|
||||
|
||||
**Step 5: Update crash recovery**
|
||||
|
||||
In `recoverFromCrash` (lines 259-268), replace the `PIPELINE_PHASES.findIndex` logic with a search through `PIPELINE_GROUPS`:
|
||||
|
||||
```typescript
|
||||
if (lastRunningPipelineJob) {
|
||||
for (let gi = 0; gi < PIPELINE_GROUPS.length; gi++) {
|
||||
const group = PIPELINE_GROUPS[gi];
|
||||
const phaseIdx = group.phases.findIndex(
|
||||
p => p.type === lastRunningPipelineJob.type &&
|
||||
(p.language || null) === (lastRunningPipelineJob.language || null)
|
||||
);
|
||||
if (phaseIdx >= 0) {
|
||||
cycleState.currentGroupIndex = gi;
|
||||
cycleState.currentSequentialPhaseIndex = group.mode === 'sequential' ? phaseIdx : 0;
|
||||
log(`Resuming pipeline from group ${gi + 1}: ${group.name}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Step 6: Update heartbeat log in main()**
|
||||
|
||||
Update the heartbeat cron (lines 551-562) and the startup log (lines 574-580) so that they reference groups instead of phases:
|
||||
|
||||
```typescript
|
||||
cron.schedule('0 * * * *', () => {
|
||||
const currentGroup = cycleState.currentGroupIndex < PIPELINE_GROUPS.length
|
||||
? PIPELINE_GROUPS[cycleState.currentGroupIndex].name
|
||||
: 'none';
|
||||
const jobs = runningJobs.size > 0
|
||||
? `Running: ${[...runningJobs.keys()].join(', ')}`
|
||||
: 'No jobs running';
|
||||
const state = cycleState.waitingForCooldown
|
||||
? 'cooldown'
|
||||
: `group ${cycleState.currentGroupIndex + 1}/${PIPELINE_GROUPS.length} (${currentGroup})`;
|
||||
log(`Heartbeat: Cycle ${cycleState.cycleNumber + 1}, ${state}. ${jobs}`);
|
||||
}, { timezone: 'UTC' });
|
||||
```
|
||||
|
||||
For the startup log:
|
||||
|
||||
```typescript
|
||||
log('=== Scheduler running (parallel grouped pipeline) ===');
|
||||
log(`Pipeline groups (${PIPELINE_GROUPS.length}):`);
|
||||
for (let i = 0; i < PIPELINE_GROUPS.length; i++) {
|
||||
const g = PIPELINE_GROUPS[i];
|
||||
const phaseNames = g.phases.map(p => p.name).join(', ');
|
||||
log(` ${i + 1}. ${g.name} [${g.mode}]: ${phaseNames}`);
|
||||
}
|
||||
```
|
||||
|
||||
**Step 7: Remove dead Google Places env log**
|
||||
|
||||
Delete lines 167-169 (the `GOOGLE_PLACES_API_KEY` log in `validateEnvironment`).
|
||||
|
||||
**Step 8: Verify build**
|
||||
|
||||
Run: `npm run build`
|
||||
Expected: Build succeeds
|
||||
|
||||
**Step 9: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/scheduler.ts
|
||||
git commit -m "feat: parallel grouped pipeline scheduler
|
||||
|
||||
Replace sequential pipeline with grouped phases. Import phases run
|
||||
sequentially, scraper phases run in parallel groups of 3. This reduces
|
||||
cycle time from days to hours. Generic scraper moved to last group."
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 3: Increase Scheduler Memory Limit
|
||||
|
||||
**Files:**
|
||||
- Modify: `docker-compose.yml:217-220`
|
||||
|
||||
**Step 1: Increase memory limit**
|
||||
|
||||
Change the scheduler service's `deploy.resources.limits.memory` from `4G` to `10G`:
|
||||
|
||||
```yaml
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 10G
|
||||
```
|
||||
|
||||
**Step 2: Commit**
|
||||
|
||||
```bash
|
||||
git add docker-compose.yml
|
||||
git commit -m "chore: increase scheduler memory to 10G for parallel scrapers"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 4: Deploy and Verify
|
||||
|
||||
**Step 1: Deploy to NAS**
|
||||
|
||||
```bash
|
||||
rsync -avz --exclude 'node_modules' --exclude '.next' --exclude '.git' --exclude '.env.local' --exclude '*.log' \
|
||||
/Users/albert/Documents/Projects/Church/ScraperControl/ albert@192.168.0.145:/volume1/docker/scraper-control/
|
||||
```
|
||||
|
||||
**Step 2: Rebuild and restart scheduler**
|
||||
|
||||
```bash
|
||||
ssh albert@192.168.0.145 'cd /volume1/docker/scraper-control && /usr/local/bin/docker compose build scheduler && /usr/local/bin/docker compose up -d scheduler'
|
||||
```
|
||||
|
||||
**Step 3: Verify logs show parallel groups**
|
||||
|
||||
```bash
|
||||
ssh albert@192.168.0.145 '/usr/local/bin/docker logs --tail 30 scraper-control-scheduler-1'
|
||||
```
|
||||
|
||||
Expected: Logs show "parallel grouped pipeline", group listings with `[parallel]` and `[sequential]` tags, and eventually multiple concurrent `Running:` entries in heartbeat.
|
||||
72
docs/plans/2026-02-26-horariosmisas-spain-design.md
Normal file
72
docs/plans/2026-02-26-horariosmisas-spain-design.md
Normal file
@@ -0,0 +1,72 @@
|
||||
# Spain Church Importer (horariosmisas.com) — Design
|
||||
|
||||
## Overview
|
||||
|
||||
Import ~10,000 Spanish churches with mass schedules from horariosmisas.com. Static WordPress site with fully permissive robots.txt and sitemaps. No Playwright needed — simple HTTP + HTML parsing.
|
||||
|
||||
## Data Source
|
||||
|
||||
- **Site:** https://horariosmisas.com
|
||||
- **Coverage:** 18,000+ churches claimed, ~10,000 in sitemaps across 52 Spanish provinces
|
||||
- **Data:** Church name, address, phone, website, mass schedules (summer/winter seasonal variants)
|
||||
- **No coordinates** — addresses only. Forward geocoding via Nominatim as a separate pass.
|
||||
- **robots.txt:** Fully permissive (`User-agent: * / Disallow:`)
|
||||
- **Sitemaps:** 20 post sitemaps + 7 category sitemaps
|
||||
|
||||
## Architecture
|
||||
|
||||
### Two-Pass Approach
|
||||
|
||||
**Pass 1: Import** — Fetch all churches from sitemaps, parse HTML, match against existing Spanish OSM churches, upsert with mass schedules. Unmatched churches created with address but no coordinates.
|
||||
|
||||
**Pass 2: Geocode** — Forward-geocode unmatched churches via Nominatim public API (`address → lat/lng`). 1 req/sec rate limit.
|
||||
|
||||
### Schema Change
|
||||
|
||||
Add `horariosMisasId String? @unique` to Church model (same pattern as `philmassId`, `massSchedulesPhId`). Update church matcher and all existing importers.
|
||||
|
||||
### URL Structure
|
||||
|
||||
- Sitemap index: `/sitemap_index.xml` → 20 post sitemaps
|
||||
- Church pages: `/{province}/{city}/{church-slug}/`
|
||||
- Non-church posts (filtered out): `/misas-diarias/`, `/santos-del-dia/`, `/oraciones/`, etc.
|
||||
|
||||
### HTML Parsing
|
||||
|
||||
- **Name:** `<h1>Church Name (City)</h1>` — strip `(City)` suffix
|
||||
- **Address:** `<p>📌 <strong>Street, PostalCode City (Province)</strong></p>`
|
||||
- **Phone:** `<strong>Teléfono:</strong> <a href="tel:...">...</a>`
|
||||
- **Website:** `<strong>Página Web:</strong> <a href="...">...</a>`
|
||||
- **Schedule:** `<table>` with `DÍA`/`HORARIO` columns
|
||||
- Two seasonal tables: `☀️ Horario de verano` and `⛄ Misas en invierno`
|
||||
- Import seasonally appropriate one (Oct-May = winter, Jun-Sep = summer)
|
||||
- Day names: Lunes, Martes, Miércoles, Jueves, Viernes, Sábado, Domingos y Festivos
|
||||
- Day ranges: "Lunes a Viernes" (Monday-Friday)
|
||||
- Time format: `HH:MMh` (24-hour), multiple per cell via `<br>`
|
||||
- Annotations stripped: `(familias)`, etc.
|
||||
|
||||
### Matching Strategy
|
||||
|
||||
1. `horariosMisasId` exact match (for re-imports)
|
||||
2. Name + proximity against existing Spanish churches (from OSM)
|
||||
3. Unmatched: create new church with address, country=ES, no coordinates
|
||||
|
||||
### CLI
|
||||
|
||||
```
|
||||
npx tsx scripts/import-horariosmisas.ts --all
|
||||
npx tsx scripts/import-horariosmisas.ts --all --dry-run
|
||||
npx tsx scripts/import-horariosmisas.ts --province madrid
|
||||
npx tsx scripts/import-horariosmisas.ts --all --geocode
|
||||
npx tsx scripts/import-horariosmisas.ts --geocode-only
|
||||
npx tsx scripts/import-horariosmisas.ts --all --resume-from 5000
|
||||
```
|
||||
|
||||
### Rate Limiting
|
||||
|
||||
- Import: 1.5s between requests (~10,000 × 1.5s = ~4.2 hours)
|
||||
- Geocode: 1s between requests (Nominatim public API limit)
|
||||
|
||||
### Scheduler Integration
|
||||
|
||||
Add to PIPELINE_GROUPS imports group (sequential, after philmass-import).
|
||||
322
docs/plans/2026-02-26-horariosmisas-spain.md
Normal file
322
docs/plans/2026-02-26-horariosmisas-spain.md
Normal file
@@ -0,0 +1,322 @@
|
||||
# Spain Church Importer (horariosmisas.com) — Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||
|
||||
**Goal:** Import ~10,000 Spanish churches with mass schedules from horariosmisas.com, with optional Nominatim forward geocoding for unmatched churches.
|
||||
|
||||
**Architecture:** Sitemap-driven importer. Fetch 20 post sitemaps for church URLs, parse static WordPress HTML for names/addresses/schedule tables, match against existing Spanish OSM churches, upsert with mass schedules. Separate geocoding pass via Nominatim public API.
|
||||
|
||||
**Tech Stack:** TypeScript, Prisma, HTML parsing (regex — no Playwright), Nominatim geocoding API.
|
||||
|
||||
---
|
||||
|
||||
## Task 1: Add `horariosMisasId` to Prisma Schema
|
||||
|
||||
**Files:**
|
||||
- Modify: `prisma/schema.prisma`
|
||||
|
||||
**Step 1: Add field and index**
|
||||
|
||||
After the `philmassId` line (around line 38), add:
|
||||
|
||||
```prisma
|
||||
horariosMisasId String? @unique @map("horarios_misas_id") // horariosmisas.com URL slug
|
||||
```
|
||||
|
||||
And add an index in the `@@index` block (around line 78):
|
||||
|
||||
```prisma
|
||||
@@index([horariosMisasId])
|
||||
```
|
||||
|
||||
**Step 2: Push schema to NAS database**
|
||||
|
||||
```bash
|
||||
npx prisma db push --accept-data-loss
|
||||
```
|
||||
|
||||
Expected: `Your database is now in sync with your Prisma schema.`
|
||||
|
||||
**Step 3: Regenerate Prisma client**
|
||||
|
||||
```bash
|
||||
npx prisma generate
|
||||
```
|
||||
|
||||
**Step 4: Push schema to Neon production**
|
||||
|
||||
```bash
|
||||
npx prisma db push --url "$(grep DATABASE_URL .env.production | sed 's/DATABASE_URL="//' | sed 's/"$//')" --accept-data-loss
|
||||
```
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add prisma/schema.prisma
|
||||
git commit -m "feat: add horariosMisasId to Church model for Spain import"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 2: Extend Church Matcher and Existing Importers
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/lib/church-matcher.ts`
|
||||
- Modify: `scripts/import-osm-churches.ts`
|
||||
- Modify: `scripts/import-gcatholic.ts`
|
||||
- Modify: `scripts/import-baidu-churches.ts`
|
||||
- Modify: `scripts/import-osm-region.ts`
|
||||
- Modify: `scripts/import-orarimesse.ts`
|
||||
- Modify: `scripts/import-mass-schedules-ph.ts`
|
||||
- Modify: `scripts/import-philmass.ts`
|
||||
|
||||
### Step 1: Update church-matcher.ts
|
||||
|
||||
In `ExistingChurch` interface (line ~11-26), add after `philmassId`:
|
||||
|
||||
```typescript
|
||||
horariosMisasId: string | null;
|
||||
```
|
||||
|
||||
In `ChurchCandidate` type (line ~113-122), add after `philmassId`:
|
||||
|
||||
```typescript
|
||||
horariosMisasId?: string;
|
||||
```
|
||||
|
||||
In `findDuplicateChurch()`, add a new pass after the fifth pass (philmassId match, lines ~169-175) and before the proximity+name pass:
|
||||
|
||||
```typescript
|
||||
// Sixth pass: exact horariosMisasId match
|
||||
if (candidate.horariosMisasId) {
|
||||
const horariosMisasMatch = existingChurches.find(
|
||||
(church) => church.horariosMisasId === candidate.horariosMisasId
|
||||
);
|
||||
if (horariosMisasMatch) return horariosMisasMatch;
|
||||
}
|
||||
```
|
||||
|
||||
Update the comment on the proximity pass to say "Seventh pass".
|
||||
|
||||
### Step 2: Update all existing importers
|
||||
|
||||
In every importer that queries churches with a `select` clause containing `philmassId: true`, add:
|
||||
|
||||
```typescript
|
||||
horariosMisasId: true,
|
||||
```
|
||||
|
||||
In every importer that creates/pushes churches with `philmassId: null`, add:
|
||||
|
||||
```typescript
|
||||
horariosMisasId: null,
|
||||
```
|
||||
|
||||
**Files to update:** `import-osm-churches.ts`, `import-gcatholic.ts`, `import-baidu-churches.ts`, `import-osm-region.ts`, `import-orarimesse.ts`, `import-mass-schedules-ph.ts`, `import-philmass.ts`
|
||||
|
||||
### Step 3: Verify build
|
||||
|
||||
```bash
|
||||
npx tsc --noEmit
|
||||
```
|
||||
|
||||
Expected: No errors.
|
||||
|
||||
### Step 4: Commit
|
||||
|
||||
```bash
|
||||
git add src/lib/church-matcher.ts scripts/import-*.ts
|
||||
git commit -m "feat: add horariosMisasId to church matcher and all importers"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 3: Create `import-horariosmisas.ts`
|
||||
|
||||
**Files:**
|
||||
- Create: `scripts/import-horariosmisas.ts`
|
||||
|
||||
### Architecture
|
||||
|
||||
This importer follows the exact same structure as `scripts/import-mass-schedules-ph.ts`. Key differences:
|
||||
|
||||
- **Sitemap:** Fetches 20 post sitemaps from sitemap index (not a single sitemap)
|
||||
- **URL filtering:** Church URLs have 3 path segments (`/{province}/{city}/{slug}/`). Non-church URLs (blog posts, daily readings) are filtered out.
|
||||
- **Schedule parsing:** Two seasonal tables (summer/winter). Import seasonally appropriate one based on current month.
|
||||
- **Day names:** Spanish (`Lunes`, `Martes`, etc.) with range support (`Lunes a Viernes`)
|
||||
- **Times:** 24-hour `HH:MMh` format (e.g., `08:00h`, `20:30h`)
|
||||
- **No coordinates:** Unmatched churches are created with placeholder coordinates `latitude: 0, longitude: 0` and are geocoded separately
|
||||
- **Geocoding:** Optional `--geocode` flag uses Nominatim public API (1 req/sec)
|
||||
|
||||
### Constants
|
||||
|
||||
```typescript
|
||||
const SITE_BASE = 'https://horariosmisas.com';
|
||||
const SITEMAP_INDEX_URL = `${SITE_BASE}/sitemap_index.xml`;
|
||||
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
|
||||
const REQUEST_DELAY_MS = 1500;
|
||||
const NOMINATIM_DELAY_MS = 1100;
|
||||
const NOMINATIM_URL = 'https://nominatim.openstreetmap.org/search';
|
||||
```
|
||||
|
||||
### Spanish Day Mapping
|
||||
|
||||
```typescript
|
||||
const DAY_MAP: Record<string, number[]> = {
|
||||
'domingos y festivos': [0],
|
||||
'domingos': [0],
|
||||
'domingo': [0],
|
||||
'lunes': [1],
|
||||
'martes': [2],
|
||||
'miércoles': [3],
|
||||
'miercoles': [3],
|
||||
'jueves': [4],
|
||||
'viernes': [5],
|
||||
'sábado': [6],
|
||||
'sabado': [6],
|
||||
'sábados': [6],
|
||||
'sabados': [6],
|
||||
};
|
||||
```
|
||||
|
||||
### Sitemap Fetching
|
||||
|
||||
1. Fetch sitemap index → extract `post-sitemap*.xml` URLs
|
||||
2. Fetch each post sitemap → extract URLs with exactly 3 path segments
|
||||
3. Filter out non-church URLs (patterns: `/misas-diarias/`, `/santos-del-dia/`, `/oraciones/`, `/noticias/`, `/blog/`, `/contacto/`, `/aviso-legal/`, `/politica-de-privacidad/`, `/politica-de-cookies/`)
|
||||
4. Deduplicate by slug
|
||||
|
||||
### HTML Parsing
|
||||
|
||||
**Church name:** `<h1>Church Name (City)</h1>` → strip `(City)` suffix
|
||||
|
||||
**Address:** `📌 <strong>Calle Goya, 26 28001 Madrid (Madrid)</strong>` → extract street, postal code (5-digit `\b\d{5}\b`), city (text after postal code), strip `(Province)` suffix
|
||||
|
||||
**Phone:** `<strong>Teléfono:</strong> <a href="tel:...">number</a>`
|
||||
|
||||
**Website:** `<strong>Página Web:</strong> <a href="url">...</a>`
|
||||
|
||||
**Schedule tables:** Find `<table>` elements with DÍA/HORARIO headers. Split by seasonal headings (☀️ verano / ⛄ invierno). Pick seasonally appropriate section (Oct-May = winter, Jun-Sep = summer). Parse `<td>` cells: first cell = day name(s), second cell = times. Times in `HH:MMh` format extracted via regex `(\d{1,2}):(\d{2})\s*h?`.
|
||||
|
||||
### Day Range Resolution
|
||||
|
||||
Support ranges like `Lunes a Viernes` → [1,2,3,4,5] and compound entries like `Lunes, Miércoles y Viernes` → [1,3,5].
|
||||
|
||||
### Geocoding (--geocode / --geocode-only)
|
||||
|
||||
Query Nominatim with: `{address}, Spain` → fallback to `{postalCode} {city}, Spain` → fallback to `{city}, Spain`. Use `countrycodes=es` parameter. Max 1 req/sec.
|
||||
|
||||
### Matching Strategy
|
||||
|
||||
1. `horariosMisasId` exact match (primary — for re-imports)
|
||||
2. Name + proximity against existing Spanish OSM churches (secondary)
|
||||
3. Unmatched: create new church with `latitude: 0, longitude: 0`, country=ES
|
||||
|
||||
### CLI
|
||||
|
||||
```
|
||||
--all Import all churches from sitemaps
|
||||
--province <name> Import only churches from this province
|
||||
--dry-run No database writes
|
||||
--geocode After import, geocode unmatched churches
|
||||
--geocode-only Only geocode (skip import)
|
||||
--resume-from <n> Skip first N churches
|
||||
--job-id <uuid> Background job tracking
|
||||
```
|
||||
|
||||
### Mass Schedule Language
|
||||
|
||||
Set `language: 'Spanish'` on all created mass schedules.
|
||||
|
||||
### Step 1: Create the file
|
||||
|
||||
Use `scripts/import-mass-schedules-ph.ts` as the structural template. Implement all functions described above.
|
||||
|
||||
### Step 2: Verify build
|
||||
|
||||
```bash
|
||||
npx tsc --noEmit
|
||||
```
|
||||
|
||||
### Step 3: Dry-run test
|
||||
|
||||
```bash
|
||||
npx tsx scripts/import-horariosmisas.ts --province navarra --dry-run
|
||||
```
|
||||
|
||||
### Step 4: Commit
|
||||
|
||||
```bash
|
||||
git add scripts/import-horariosmisas.ts
|
||||
git commit -m "feat: add horariosmisas.com Spain church importer"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 4: Add to Scheduler Pipeline and npm Scripts
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/scheduler.ts`
|
||||
- Modify: `package.json`
|
||||
|
||||
### Step 1: Add to PIPELINE_GROUPS
|
||||
|
||||
In `scripts/scheduler.ts`, in the `imports` group (line ~40-51), add after the `philmass-import` entry:
|
||||
|
||||
```typescript
|
||||
{ name: 'horariosmisas-import', type: 'horariosmisas-import', config: {} },
|
||||
```
|
||||
|
||||
### Step 2: Add getJobCommand case
|
||||
|
||||
In the `getJobCommand` function (around line ~182), before the `default:` case, add:
|
||||
|
||||
```typescript
|
||||
case 'horariosmisas-import': {
|
||||
const args = ['tsx', 'scripts/import-horariosmisas.ts', '--all', '--geocode'];
|
||||
if (config?.province) args.push('--province', String(config.province));
|
||||
if (config?.resumeFrom) args.push('--resume-from', String(config.resumeFrom));
|
||||
return { command: 'npx', args };
|
||||
}
|
||||
```
|
||||
|
||||
### Step 3: Add npm scripts
|
||||
|
||||
In `package.json`, add after the `"import:philmass"` line:
|
||||
|
||||
```json
|
||||
"import:horariosmisas": "tsx scripts/import-horariosmisas.ts",
|
||||
```
|
||||
|
||||
### Step 4: Verify build
|
||||
|
||||
```bash
|
||||
npx tsc --noEmit
|
||||
```
|
||||
|
||||
### Step 5: Commit
|
||||
|
||||
```bash
|
||||
git add scripts/scheduler.ts package.json
|
||||
git commit -m "feat: add horariosmisas import to scheduler pipeline"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Verification
|
||||
|
||||
1. **Dry run on single province**: `npx tsx scripts/import-horariosmisas.ts --province navarra --dry-run`
|
||||
- Verify: church names parsed correctly, schedules extracted, matches found
|
||||
2. **Dry run on Madrid**: `npx tsx scripts/import-horariosmisas.ts --province madrid --dry-run`
|
||||
- Verify: larger province, summer/winter schedule selection, address parsing
|
||||
3. **Single province real import**: `npx tsx scripts/import-horariosmisas.ts --province navarra`
|
||||
- Verify: churches created/updated, mass schedules in database
|
||||
4. **Geocode test**: `npx tsx scripts/import-horariosmisas.ts --geocode-only --dry-run`
|
||||
- Verify: finds churches needing geocoding, Nominatim returns coordinates
|
||||
5. **Full import**: `npx tsx scripts/import-horariosmisas.ts --all --geocode`
|
||||
|
||||
## Runtime Estimate
|
||||
|
||||
- Sitemap fetch: 20 sitemaps x 1.5s = ~30s
|
||||
- Import: ~10,000 churches x 1.5s = ~4.2 hours
|
||||
- Geocode: depends on unmatched count x 1.1s
|
||||
103
docs/plans/2026-03-01-weekdaymasses-importer-design.md
Normal file
103
docs/plans/2026-03-01-weekdaymasses-importer-design.md
Normal file
@@ -0,0 +1,103 @@
|
||||
# weekdaymasses.org.uk Global Importer
|
||||
|
||||
## Context
|
||||
|
||||
weekdaymasses.org.uk is a UK-based Catholic directory covering ~3,500-4,000 churches globally with mass schedules, coordinates, addresses, and phone numbers. It covers GB, Ireland, and 49+ international countries (India, Sri Lanka, South Korea, Japan, and more). All data for an area is served on a single HTML page — no pagination or API needed.
|
||||
|
||||
## Data Source
|
||||
|
||||
Three area pages cover the entire site:
|
||||
|
||||
| Page | URL | Est. Churches |
|
||||
|------|-----|---------------|
|
||||
| GB | `/en/area/gb/churches` | ~3,000+ |
|
||||
| Ireland | `/en/area/ireland/churches` | ~300+ |
|
||||
| Outside GB | `/en/area/outside-gb/churches` | ~152+ |
|
||||
|
||||
Individual country/region pages (e.g. `/en/area/india/churches`) are subsets of these three.
|
||||
|
||||
### Data per church
|
||||
|
||||
- **Name**: h3 heading, format "Church Name (Location)"
|
||||
- **Address**: plain text after mass times, with postal/zip code
|
||||
- **Coordinates**: in map link query params `lat=XX.XXXX&lon=YY.YYYY&church_id=NNNNN`
|
||||
- **Mass times**: format `Day: H.MMam/pm(Language), H.MMam/pm(Language)`
|
||||
- **Phone**: `Tel: +XX XXXX XXXXXX`
|
||||
- **Website**: occasional links
|
||||
- **church_id**: unique numeric identifier in map links
|
||||
|
||||
### Mass time format
|
||||
|
||||
```
|
||||
Sunday: 6.30am(Tamil), 8.30am(Tamil), 5.30pm(English)
|
||||
Mon Tue Wed Thu Fri: 6.30am(Tamil)
|
||||
Saturday: 6.30am(Tamil), 5.30pm(English)
|
||||
```
|
||||
|
||||
Day labels: `Sunday`, `Mon`, `Tue`, `Wed`, `Thu`, `Fri`, `Saturday`, or combinations like `Mon Tue Wed Thu Fri`. Also `Holy Day` entries.
|
||||
|
||||
Time format: `H.MMam/pm` — needs conversion to 24h `HH:MM`.
|
||||
|
||||
Language in parentheses maps to our `language` field on mass_schedules.
|
||||
|
||||
### Country detection
|
||||
|
||||
The address is the last line of each church entry. Country can be detected by:
|
||||
- GB: UK postal code pattern (e.g. `SW1A 1AA`)
|
||||
- Ireland: Irish Eircode (e.g. `D01 F5P2`) or "Ireland" in address
|
||||
- India: 6-digit postal code (e.g. `600088`)
|
||||
- Others: country name at end of address, or fallback to the area page being scraped
|
||||
|
||||
## Design
|
||||
|
||||
### Schema
|
||||
|
||||
Add to Church model in both BethelGuide and ScraperControl:
|
||||
|
||||
```prisma
|
||||
weekdayMassesId String? @unique @map("weekday_masses_id")
|
||||
@@index([weekdayMassesId])
|
||||
```
|
||||
|
||||
### Script: `scripts/import-weekdaymasses.ts`
|
||||
|
||||
Single script that:
|
||||
|
||||
1. Fetches area pages (default: all 3; filterable with `--area gb|ireland|outside-gb|india|...`)
|
||||
2. Parses HTML into structured church entries
|
||||
3. Converts mass times from `H.MMam/pm` to `HH:MM` 24h format
|
||||
4. Detects country from address patterns
|
||||
5. Matches against existing churches by `weekdayMassesId` (exact) then proximity+name
|
||||
6. Upserts churches and replaces mass schedules
|
||||
|
||||
### HTML parsing strategy
|
||||
|
||||
Each church is a block between consecutive h3 headings. Within each block:
|
||||
- h3 content = church name
|
||||
- Lines with day labels + times = mass schedule
|
||||
- Map link = coordinates + church_id
|
||||
- Last text block before next h3 = address
|
||||
- `Tel:` prefix = phone
|
||||
|
||||
### CLI flags
|
||||
|
||||
- `--all` — import all 3 area pages
|
||||
- `--area <name>` — import specific area (gb, ireland, outside-gb, india, sri-lanka, etc.)
|
||||
- `--dry-run` — no database writes
|
||||
- `--resume-from <n>` — skip first N churches
|
||||
- `--job-id <uuid>` — background job tracking
|
||||
|
||||
### Church matcher integration
|
||||
|
||||
Add `weekdayMassesId` to `ExistingChurch`, `ChurchCandidate`, and a new match pass in `findDuplicateChurch()`.
|
||||
|
||||
### Scheduler integration
|
||||
|
||||
Add `weekdaymasses-import` to the sequential imports group in the pipeline, with `getJobCommand()` case and npm script.
|
||||
|
||||
## Scope
|
||||
|
||||
- ~3,500-4,000 churches with mass schedules
|
||||
- Most GB/Ireland churches already in DB from OSM (will match and add schedules)
|
||||
- India/Sri Lanka/international churches partially in DB from OSM/gcatholic
|
||||
- Value: mass schedule data for thousands of churches that currently have none
|
||||
6
next-env.d.ts
vendored
Normal file
6
next-env.d.ts
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/// <reference types="next" />
|
||||
/// <reference types="next/image-types/global" />
|
||||
import "./.next/types/routes.d.ts";
|
||||
|
||||
// NOTE: This file should not be edited
|
||||
// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
|
||||
9
next.config.ts
Normal file
9
next.config.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
import type { NextConfig } from 'next';
|
||||
|
||||
const nextConfig: NextConfig = {
|
||||
output: 'standalone',
|
||||
poweredByHeader: false,
|
||||
reactStrictMode: true,
|
||||
};
|
||||
|
||||
export default nextConfig;
|
||||
8677
package-lock.json
generated
Normal file
8677
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
8
postcss.config.mjs
Normal file
8
postcss.config.mjs
Normal file
@@ -0,0 +1,8 @@
|
||||
/** @type {import('postcss-load-config').Config} */
|
||||
const config = {
|
||||
plugins: {
|
||||
'@tailwindcss/postcss': {},
|
||||
},
|
||||
};
|
||||
|
||||
export default config;
|
||||
@@ -46,6 +46,7 @@ model Church {
|
||||
gottesdienstzeitenId String? @unique @map("gottesdienstzeiten_id")
|
||||
kerknetId String? @unique @map("kerknet_id")
|
||||
buscarmisasNetworkId String? @unique @map("buscarmisas_network_id")
|
||||
gcatholicId String? @unique @map("gcatholic_id")
|
||||
claimed Boolean @default(false)
|
||||
claimedAt DateTime? @map("claimed_at")
|
||||
lastScrapedAt DateTime? @map("last_scraped_at")
|
||||
@@ -59,6 +60,7 @@ model Church {
|
||||
googleSearchedAt DateTime? @map("google_searched_at") // When Google Places enrichment was attempted
|
||||
createdAt DateTime @default(now()) @map("created_at")
|
||||
updatedAt DateTime @updatedAt @map("updated_at")
|
||||
parochiaSlug String? @map("parochia_slug")
|
||||
|
||||
dioceseId String? @map("diocese_id")
|
||||
|
||||
@@ -99,6 +101,7 @@ model Church {
|
||||
@@index([gottesdienstzeitenId])
|
||||
@@index([kerknetId])
|
||||
@@index([buscarmisasNetworkId])
|
||||
@@index([gcatholicId])
|
||||
@@index([dioceseId])
|
||||
@@index([claimedByUserId])
|
||||
@@map("churches")
|
||||
|
||||
165
scripts/debug/analyze-enrichment-priority.ts
Normal file
165
scripts/debug/analyze-enrichment-priority.ts
Normal file
@@ -0,0 +1,165 @@
|
||||
import { config } from 'dotenv';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
|
||||
// Load .env.local first, then .env
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
const connectionString = process.env.DATABASE_URL;
|
||||
|
||||
if (!connectionString) {
|
||||
throw new Error('DATABASE_URL environment variable is not set');
|
||||
}
|
||||
|
||||
const pool = new Pool({ connectionString });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Per-country tallies and the derived priority score used by the
 * enrichment-priority report.
 */
interface CountryStats {
  /** Country value taken from the church row, or 'Unknown' when it is empty. */
  country: string;
  /** Number of churches counted for this country. */
  totalChurches: number;
  /** Churches that have the `hasWebsite` flag set or a non-empty `website`. */
  withWebsite: number;
  /** Churches with neither the `hasWebsite` flag nor a `website` value. */
  withoutWebsite: number;
  /** `withWebsite / totalChurches`, expressed as a percentage (0-100). */
  websitePercent: number;
  /** Churches still needing enrichment; incremented together with `withoutWebsite`. */
  needEnrichment: number;
  /** Ranking score; higher values are listed first in the report. */
  priority: number;
}
|
||||
|
||||
/**
 * Prints an enrichment-priority report for OSM-sourced churches.
 *
 * Loads every church whose `source` is 'osm', groups the rows by country,
 * scores each country and writes to stdout: the full ranking table, a detailed
 * top-10 list, a timeline estimate and a paste-ready `COUNTRY_PRIORITY` array.
 *
 * Failures are logged and reported through a non-zero exit code. The Prisma
 * client and the pg pool are released whether or not the report succeeds.
 */
async function analyzeEnrichmentPriority() {
  // Daily throughput assumed by the timeline estimate (labelled "free tier" in the output).
  const churchesPerDay = 390;
  const divider = '═══════════════════════════════════════════════════════════════════════════';

  try {
    console.log('Analyzing enrichment priority by country...\n');

    // Get all OSM churches; only the fields needed for the tallies are selected.
    const churches = await prisma.church.findMany({
      where: {
        source: 'osm',
      },
      select: {
        country: true,
        hasWebsite: true,
        website: true,
      },
    });

    // Group by country and count churches with / without a website.
    const byCountry: Record<string, CountryStats> = {};
    for (const church of churches) {
      const country = church.country || 'Unknown';

      let entry = byCountry[country];
      if (!entry) {
        entry = {
          country,
          totalChurches: 0,
          withWebsite: 0,
          withoutWebsite: 0,
          websitePercent: 0,
          needEnrichment: 0,
          priority: 0,
        };
        byCountry[country] = entry;
      }

      entry.totalChurches++;
      if (church.hasWebsite || church.website) {
        entry.withWebsite++;
      } else {
        entry.withoutWebsite++;
        entry.needEnrichment++;
      }
    }

    // Calculate percentages and priority score.
    const stats = Object.values(byCountry);
    for (const stat of stats) {
      // totalChurches is at least 1 for every entry, so this never divides by zero.
      stat.websitePercent = (stat.withWebsite / stat.totalChurches) * 100;

      // Priority = needWeight * 0.8 + (coverageGap / 100) * needWeight * 0.2
      // i.e. mostly driven by the number of churches needing enrichment, with a
      // bonus for low website coverage. This favors large countries with low coverage.
      const needWeight = stat.needEnrichment / 1000; // Normalize to thousands
      const coverageGap = 100 - stat.websitePercent; // How much coverage is missing
      stat.priority = needWeight * 0.8 + (coverageGap / 100) * needWeight * 0.2;
    }

    // Sort by priority (highest first)
    stats.sort((a, b) => b.priority - a.priority);

    // Display results
    console.log(divider);
    console.log('ENRICHMENT PRIORITY RANKING');
    console.log(divider);
    console.log('');
    // The printed formula mirrors the computation above (same expression, factored).
    console.log('Priority formula: (churches_needing_enrichment / 1000) * (0.8 + 0.2 * coverage_gap / 100)');
    console.log('This favors countries with many churches and low website coverage.');
    console.log('');
    console.log('Rank | Country | Total | Need Enrichment | Coverage | Priority Score');
    console.log('─────┼─────────┼───────┼────────────────┼──────────┼────────────────');

    stats.forEach((stat, index) => {
      const rank = String(index + 1).padStart(4);
      const country = stat.country.padEnd(7);
      const total = String(stat.totalChurches).padStart(5);
      const need = String(stat.needEnrichment).padStart(15);
      const coverage = `${stat.websitePercent.toFixed(1)}%`.padStart(8);
      const priority = stat.priority.toFixed(2).padStart(14);

      console.log(`${rank} | ${country} | ${total} | ${need} | ${coverage} | ${priority}`);
    });

    console.log('');
    console.log(divider);
    console.log('');

    // Show top 10 with details
    console.log('TOP 10 COUNTRIES TO PRIORITIZE:');
    console.log('');

    stats.slice(0, 10).forEach((stat, index) => {
      console.log(`${index + 1}. ${stat.country}`);
      console.log(` Total churches: ${stat.totalChurches.toLocaleString()}`);
      console.log(` Need enrichment: ${stat.needEnrichment.toLocaleString()} (${(100 - stat.websitePercent).toFixed(1)}% missing)`);
      console.log(` Current coverage: ${stat.websitePercent.toFixed(1)}%`);
      console.log(` Priority score: ${stat.priority.toFixed(2)}`);
      console.log('');
    });

    // Calculate enrichment timeline
    const totalNeedEnrichment = stats.reduce((sum, s) => sum + s.needEnrichment, 0);
    const daysAtFullSpeed = Math.ceil(totalNeedEnrichment / churchesPerDay);
    const monthsAtFullSpeed = (daysAtFullSpeed / 30).toFixed(1);

    console.log(divider);
    console.log('ENRICHMENT TIMELINE');
    console.log(divider);
    console.log(`Total churches needing enrichment: ${totalNeedEnrichment.toLocaleString()}`);
    console.log(`At ${churchesPerDay} churches/day (free tier): ${daysAtFullSpeed} days (~${monthsAtFullSpeed} months)`);
    console.log('');

    // Output country priority order for the script
    console.log(divider);
    console.log('COUNTRY PRIORITY ORDER (for enrichment script)');
    console.log(divider);
    console.log('');
    console.log('const COUNTRY_PRIORITY = [');

    // Filter once up front instead of re-filtering on every iteration.
    const pendingCountries = stats.filter((s) => s.needEnrichment > 0);
    pendingCountries.forEach((stat, index) => {
      const comma = index < pendingCountries.length - 1 ? ',' : '';
      console.log(` '${stat.country}'${comma} // ${stat.needEnrichment.toLocaleString()} churches`);
    });
    console.log('];');
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // Set the exit code rather than calling process.exit(): exit() terminates
    // the process immediately, which would skip the cleanup in `finally`.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
analyzeEnrichmentPriority();
|
||||
66
scripts/debug/check-2-real-bugs.ts
Normal file
66
scripts/debug/check-2-real-bugs.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Check the 2 potentially real bugs
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Manually re-scrapes two church websites that were suspected of exposing
 * scraper bugs and prints what the generic scraper extracts from each.
 *
 * 1. A Spanish parish site, scraped at its root URL with country 'ES'.
 * 2. A Polish parish site, scraped with country 'PL'. When no schedules are
 *    found but raw HTML is available, the HTML is reduced to lower-cased text
 *    and a 300-character snippet around the first Polish mass keyword is shown.
 *
 * The scraper is closed even when one of the scrapes throws.
 */
async function checkRealBugs() {
  const dayLabels = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    console.log('=== 1. Iglesia de San Fernando (trying Spanish page) ===\n');

    scraper.setCountry('ES');
    // Root URL, i.e. with the /de/ path segment removed.
    const spanishUrl = 'https://www.parroquiasanfernandomaspalomas.net/';
    const result1 = await scraper.scrape(spanishUrl);

    console.log(`URL: ${spanishUrl}`);
    console.log(`Success: ${result1.success}`);
    console.log(`Schedules: ${result1.schedules.length}`);
    console.log(`Error: ${result1.error || 'none'}\n`);

    if (result1.schedules.length > 0) {
      console.log('Sample schedules:');
      result1.schedules.slice(0, 5).forEach((s) => {
        console.log(` ${dayLabels[s.dayOfWeek]} ${s.time} - ${s.language} ${s.massType}`);
      });
    }

    console.log('\n=== 2. Kościół (Poland) ===\n');

    scraper.setCountry('PL');
    const result2 = await scraper.scrape('http://parafialubojna.pl');

    console.log(`Success: ${result2.success}`);
    console.log(`Schedules: ${result2.schedules.length}`);
    console.log(`Error: ${result2.error || 'none'}\n`);

    if (result2.schedules.length > 0) {
      console.log('Sample schedules:');
      result2.schedules.slice(0, 5).forEach((s) => {
        console.log(` ${dayLabels[s.dayOfWeek]} ${s.time} - ${s.language} ${s.massType}`);
      });
    } else if (result2.rawHtml) {
      // Strip scripts, styles and tags, then collapse whitespace to get plain text.
      const text = result2.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Look for Polish schedule keywords, in priority order. indexOf() returns
      // -1 on a miss, which is truthy, so the keywords cannot be chained with
      // `||`; pick the first keyword that actually occurs instead.
      const scheduleIndex =
        ['msze', 'msza', 'nabożeńst']
          .map((keyword) => text.indexOf(keyword))
          .find((position) => position !== -1) ?? -1;

      if (scheduleIndex !== -1) {
        const snippet = text.substring(scheduleIndex, scheduleIndex + 300);
        console.log('Found schedule section:');
        console.log(snippet);
      }
    }
  } finally {
    // Always release the scraper, including when a scrape above throws.
    await scraper.close();
  }
}
|
||||
|
||||
checkRealBugs().catch(console.error);
|
||||
79
scripts/debug/check-enrichment-detail.ts
Normal file
79
scripts/debug/check-enrichment-detail.ts
Normal file
@@ -0,0 +1,79 @@
|
||||
import { Pool } from 'pg';
|
||||
import * as dotenv from 'dotenv';
|
||||
import * as path from 'path';
|
||||
|
||||
// Load .env.local first (takes precedence), then .env
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
const pool = new Pool({
|
||||
connectionString: process.env.DATABASE_URL,
|
||||
});
|
||||
|
||||
/**
 * Prints a Google Places enrichment report for the `churches` table.
 *
 * Runs three read-only queries through the module-level `pool` and logs:
 *  1. the 20 countries with the most rows whose `google_place_id` is NULL,
 *  2. overall enriched / pending totals with their share of all rows,
 *  3. per-day counts of enriched rows whose `updated_at` is within 7 days.
 *
 * Query failures are logged rather than rethrown; the pool is always closed.
 */
async function checkEnrichmentDetail() {
  // Share of `part` in `whole` with two decimals. An empty table yields
  // "0.00" instead of the "NaN" that a bare division would print.
  const percentOf = (part: unknown, whole: unknown): string => {
    const denominator = Number(whole);
    return (denominator > 0 ? (Number(part) / denominator) * 100 : 0).toFixed(2);
  };

  try {
    console.log('Connecting to database...\n');

    // Check churches awaiting enrichment
    const pendingResult = await pool.query(`
      SELECT
        country,
        COUNT(*) as pending_count
      FROM churches
      WHERE google_place_id IS NULL
      GROUP BY country
      ORDER BY pending_count DESC
      LIMIT 20;
    `);

    console.log('=== Churches Awaiting Enrichment (Top 20 Countries) ===');
    let totalPending = 0;
    for (const row of pendingResult.rows) {
      console.log(`${row.country}: ${row.pending_count} churches`);
      // Explicit radix: the count column is parsed as a decimal string.
      totalPending += parseInt(row.pending_count, 10);
    }
    // Only the 20 listed countries are summed, hence "shown".
    console.log(`\nTotal pending shown: ${totalPending}`);

    // Check total stats
    const statsResult = await pool.query(`
      SELECT
        COUNT(*) as total_churches,
        COUNT(CASE WHEN google_place_id IS NOT NULL THEN 1 END) as enriched,
        COUNT(CASE WHEN google_place_id IS NULL THEN 1 END) as pending
      FROM churches;
    `);
    const stats = statsResult.rows[0];

    console.log('\n=== Overall Stats ===');
    console.log(`Total churches: ${stats.total_churches}`);
    console.log(`Enriched: ${stats.enriched} (${percentOf(stats.enriched, stats.total_churches)}%)`);
    console.log(`Pending: ${stats.pending} (${percentOf(stats.pending, stats.total_churches)}%)`);

    // Check enrichment rate
    const rateResult = await pool.query(`
      SELECT
        DATE(updated_at) as date,
        COUNT(*) as enriched_count
      FROM churches
      WHERE google_place_id IS NOT NULL
        AND updated_at > NOW() - INTERVAL '7 days'
      GROUP BY DATE(updated_at)
      ORDER BY date DESC;
    `);

    console.log('\n=== Enrichment Activity (Last 7 Days) ===');
    if (rateResult.rows.length === 0) {
      console.log('No enrichment activity in the last 7 days');
    } else {
      for (const row of rateResult.rows) {
        console.log(`${row.date}: ${row.enriched_count} churches`);
      }
    }
  } catch (error) {
    console.error('Error checking enrichment detail:', error);
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
checkEnrichmentDetail();
|
||||
146
scripts/debug/check-enrichment-status.ts
Normal file
146
scripts/debug/check-enrichment-status.ts
Normal file
@@ -0,0 +1,146 @@
|
||||
import { config } from 'dotenv';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
|
||||
// Load .env.local first, then .env
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
const connectionString = process.env.DATABASE_URL;
|
||||
|
||||
if (!connectionString) {
|
||||
throw new Error('DATABASE_URL environment variable is not set');
|
||||
}
|
||||
|
||||
const pool = new Pool({ connectionString });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Prints the enrichment status of OSM-sourced churches via the module-level
 * Prisma client: overall totals, a per-country table for ten priority
 * countries, and a completion estimate at a fixed daily enrichment rate.
 *
 * On failure the error is logged and the process exit code is set to 1.
 * `process.exitCode` is used instead of `process.exit()` so that the
 * `finally` block still runs and both Prisma and the pg pool are shut down.
 */
async function checkEnrichmentStatus() {
  const RULE = '═══════════════════════════════════════════════════════════════';
  const PRIORITY_COUNTRIES = ['FR', 'DE', 'ES', 'PL', 'BR', 'PT', 'PH', 'CZ', 'MX', 'HU'];
  // Daily throughput used for the timeline estimate below.
  const CHURCHES_PER_DAY = 390;

  // Share of `part` in `whole`; a zero denominator yields 0 instead of NaN.
  const percent = (part: number, whole: number, digits: number): string =>
    (whole > 0 ? (part / whole) * 100 : 0).toFixed(digits);

  try {
    console.log('Checking enrichment status...\n');

    // Recently enriched (last 24 hours)
    const yesterday = new Date();
    yesterday.setDate(yesterday.getDate() - 1);

    // The overall counts are independent of each other, so run them together.
    const [totalOSM, enriched, withWebsite, needEnrichment, recentlyEnriched] = await Promise.all([
      prisma.church.count({ where: { source: 'osm' } }),
      prisma.church.count({ where: { source: 'osm', googlePlaceId: { not: null } } }),
      prisma.church.count({ where: { source: 'osm', hasWebsite: true } }),
      prisma.church.count({ where: { source: 'osm', hasWebsite: false, website: null } }),
      prisma.church.count({
        where: {
          source: 'osm',
          googlePlaceId: { not: null },
          updatedAt: { gte: yesterday },
        },
      }),
    ]);

    console.log(RULE);
    console.log('OVERALL ENRICHMENT STATUS');
    console.log(RULE);
    console.log(`Total OSM churches: ${totalOSM.toLocaleString()}`);
    console.log(`Churches with Google Place ID: ${enriched.toLocaleString()} (${percent(enriched, totalOSM, 2)}%)`);
    console.log(`Churches with websites: ${withWebsite.toLocaleString()} (${percent(withWebsite, totalOSM, 2)}%)`);
    console.log(`Need enrichment: ${needEnrichment.toLocaleString()} (${percent(needEnrichment, totalOSM, 2)}%)`);
    console.log('');
    console.log(`Recently enriched (24h): ${recentlyEnriched.toLocaleString()}`);
    console.log('');

    // Priority countries breakdown
    console.log(RULE);
    console.log('TOP 10 PRIORITY COUNTRIES STATUS');
    console.log(RULE);
    console.log('');

    // Countries are processed one after another so rows print in list order.
    for (const country of PRIORITY_COUNTRIES) {
      const [total, countryEnriched, countryWithWebsite, countryNeedEnrichment] = await Promise.all([
        prisma.church.count({ where: { source: 'osm', country } }),
        prisma.church.count({ where: { source: 'osm', country, googlePlaceId: { not: null } } }),
        // NOTE(review): this "With Website" figure also counts rows that only
        // have a Google Place ID; confirm that is the intended definition.
        prisma.church.count({
          where: {
            source: 'osm',
            country,
            OR: [{ hasWebsite: true }, { googlePlaceId: { not: null } }],
          },
        }),
        prisma.church.count({ where: { source: 'osm', country, hasWebsite: false, website: null } }),
      ]);

      console.log(`${country.padEnd(4)} | Total: ${String(total).padStart(6)} | Enriched: ${String(countryEnriched).padStart(5)} (${percent(countryEnriched, total, 1)}%) | With Website: ${String(countryWithWebsite).padStart(5)} (${percent(countryWithWebsite, total, 1)}%) | Need: ${String(countryNeedEnrichment).padStart(6)}`);
    }

    console.log('');

    // Estimate timeline
    const daysRemaining = Math.ceil(needEnrichment / CHURCHES_PER_DAY);
    const monthsRemaining = (daysRemaining / 30).toFixed(1);
    const completionDate = new Date(Date.now() + daysRemaining * 24 * 60 * 60 * 1000);

    console.log(RULE);
    console.log('TIMELINE ESTIMATE');
    console.log(RULE);
    console.log(`At ${CHURCHES_PER_DAY} churches/day:`);
    console.log(` Days remaining: ${daysRemaining} days`);
    console.log(` Months remaining: ~${monthsRemaining} months`);
    console.log(` Estimated completion: ${completionDate.toLocaleDateString()}`);
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // Not process.exit(1): that would terminate before `finally` runs.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
checkEnrichmentStatus();
|
||||
78
scripts/debug/check-enrichment.ts
Normal file
78
scripts/debug/check-enrichment.ts
Normal file
@@ -0,0 +1,78 @@
|
||||
import { Pool } from 'pg';
|
||||
import * as dotenv from 'dotenv';
|
||||
import * as path from 'path';
|
||||
|
||||
// Load .env.local first (takes precedence), then .env
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
const pool = new Pool({
|
||||
connectionString: process.env.DATABASE_URL,
|
||||
});
|
||||
|
||||
/**
 * Logs a summary of Google enrichment in the `churches` table using the
 * module-level `pool`: overall totals, the ten countries with the most
 * rows updated in the last 24 hours, and up to twenty recently updated rows.
 *
 * Errors are logged and swallowed; the pool is closed in every case.
 */
async function checkEnrichment() {
  try {
    console.log('Connecting to database...');

    // Overall totals for rows that already carry a Google Place ID.
    const { rows: summaryRows } = await pool.query(`
      SELECT
        COUNT(*) as total_enriched,
        COUNT(CASE WHEN updated_at > NOW() - INTERVAL '24 hours' THEN 1 END) as enriched_last_24h,
        MAX(updated_at) as last_enrichment
      FROM churches
      WHERE google_place_id IS NOT NULL;
    `);

    console.log('\n=== Google Enrichment Summary ===');
    console.log(`Total churches with Google Place ID: ${summaryRows[0].total_enriched}`);
    console.log(`Enriched in last 24 hours: ${summaryRows[0].enriched_last_24h}`);
    console.log(`Last enrichment: ${summaryRows[0].last_enrichment}`);

    // Per-country breakdown, ordered by activity in the last 24 hours.
    const { rows: perCountry } = await pool.query(`
      SELECT
        country,
        COUNT(*) as enriched_count,
        COUNT(CASE WHEN updated_at > NOW() - INTERVAL '24 hours' THEN 1 END) as enriched_last_24h
      FROM churches
      WHERE google_place_id IS NOT NULL
      GROUP BY country
      ORDER BY enriched_last_24h DESC
      LIMIT 10;
    `);

    console.log('\n=== Top Countries Enriched (Last 24h) ===');
    for (const entry of perCountry) {
      console.log(`${entry.country}: ${entry.enriched_last_24h} new / ${entry.enriched_count} total`);
    }

    // A sample of the most recently updated enriched rows.
    const { rows: recentRows } = await pool.query(`
      SELECT
        name,
        city,
        country,
        google_place_id,
        updated_at
      FROM churches
      WHERE google_place_id IS NOT NULL
        AND updated_at > NOW() - INTERVAL '24 hours'
      ORDER BY updated_at DESC
      LIMIT 20;
    `);

    console.log('\n=== Recent Enrichments (Last 24h, sample) ===');
    for (const entry of recentRows) {
      let timestamp = 'unknown';
      if (entry.updated_at) {
        timestamp = new Date(entry.updated_at).toISOString();
      }
      console.log(`${entry.name}, ${entry.city}, ${entry.country} - ${timestamp}`);
    }
  } catch (error) {
    console.error('Error checking enrichment:', error);
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
checkEnrichment();
|
||||
45
scripts/debug/check-german-office-hours.ts
Normal file
45
scripts/debug/check-german-office-hours.ts
Normal file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Check the full section text for German church to understand office hours pattern
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes https://www.alterpeter.de/ with the country set to 'DE' and prints
 * the text that follows the first "montag" and the first "sonntag" in the
 * page, to help inspect how the site words its office hours.
 *
 * The scraper is closed in a `finally` block so it is released even when
 * `scrape()` rejects.
 */
async function checkGerman() {
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (result.rawHtml) {
      // Reduce the HTML to lower-cased plain text: drop <script>/<style>
      // bodies, strip remaining tags, then collapse whitespace runs.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find Monday section
      const montagIndex = text.indexOf('montag');
      if (montagIndex !== -1) {
        console.log('=== Monday (Montag) context ===');
        console.log(text.substring(montagIndex, montagIndex + 200));
        console.log('');
      }

      // Find Sunday section
      const sonntagIndex = text.indexOf('sonntag');
      if (sonntagIndex !== -1) {
        console.log('=== Sunday (Sonntag) context ===');
        console.log(text.substring(sonntagIndex, sonntagIndex + 300));
        console.log('');
      }
    }
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
checkGerman().catch(console.error);
|
||||
51
scripts/debug/check-neon-poland.ts
Normal file
51
scripts/debug/check-neon-poland.ts
Normal file
@@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { config } from 'dotenv';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
// Load environment variables
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
/**
 * Connects to the database named by DATABASE_URL and reports how many
 * churches with country 'PL' exist, how many of them have at least one mass
 * schedule, and a three-row sample with schedule counts.
 *
 * The connection string is logged with its password masked. Prisma and the
 * pg pool are released in a `finally` block so a failing query does not
 * leave connections open.
 */
async function main() {
  const connectionString = process.env.DATABASE_URL || '';
  // Mask the password part (":secret@") before echoing the URL.
  console.log('DATABASE_URL:', connectionString.replace(/:[^:@]+@/, ':****@'));

  const pool = new Pool({ connectionString });
  const adapter = new PrismaPg(pool);
  const prisma = new PrismaClient({ adapter });

  try {
    console.log('PrismaClient created:', !!prisma);
    // NOTE(review): other debug scripts in this repository use the
    // `prisma.church` delegate; confirm which name the generated client exposes.
    console.log('prisma.churches:', !!prisma.churches);

    await prisma.$connect();

    const count = await prisma.churches.count({ where: { country: 'PL' } });
    console.log(`Poland churches in Neon: ${count}`);

    const withSchedules = await prisma.churches.count({
      where: {
        country: 'PL',
        massSchedules: { some: {} },
      },
    });
    console.log(`With mass schedules: ${withSchedules}`);

    // Sample a few churches
    const sample = await prisma.churches.findMany({
      where: { country: 'PL' },
      include: { massSchedules: true },
      take: 3,
    });

    console.log('\nSample churches:');
    for (const church of sample) {
      console.log(` - ${church.name} (${church.city}): ${church.massSchedules.length} schedules`);
    }
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
38
scripts/debug/check-niedziela-occurrences.ts
Normal file
38
scripts/debug/check-niedziela-occurrences.ts
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes http://parafialubojna.pl with the country set to 'PL' and lists
 * every occurrence of "niedziela" in the page's plain text, each with 30
 * characters of leading and 70 characters of trailing context (measured from
 * the match start).
 *
 * The scraper is closed in a `finally` block so it is released even when
 * `scrape()` rejects.
 */
async function check() {
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('PL');

    const result = await scraper.scrape('http://parafialubojna.pl');
    if (result.rawHtml) {
      // Reduce the HTML to lower-cased plain text: drop <script>/<style>
      // bodies, strip remaining tags, then collapse whitespace runs.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      const needle = 'niedziela';
      const occurrences: Array<{ position: number; context: string }> = [];

      // Resume each search one character past the previous hit.
      for (let idx = text.indexOf(needle); idx !== -1; idx = text.indexOf(needle, idx + 1)) {
        occurrences.push({
          position: idx,
          context: text.substring(Math.max(0, idx - 30), idx + 70),
        });
      }

      console.log(`niedziela occurrences: ${occurrences.length}\n`);
      occurrences.forEach((match, i) => {
        console.log(`Occurrence ${i + 1} at position ${match.position}:`);
        console.log(` "${match.context}"`);
        console.log('');
      });
    }
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
check();
|
||||
34
scripts/debug/check-osm-counts.ts
Normal file
34
scripts/debug/check-osm-counts.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
import { Pool } from 'pg';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/**
 * Prints counts of OSM-sourced rows in `churches` using the module-level
 * `pool`: the overall total, the 40 largest countries, a fixed list of
 * countries flagged as under-imported, and the number of distinct countries.
 *
 * The pool is closed in a `finally` block so a failing query does not leave
 * it open.
 */
async function main() {
  try {
    const totalRes = await pool.query(`SELECT COUNT(*) as total FROM churches WHERE source = 'osm'`);
    console.log('Total OSM churches:', totalRes.rows[0].total);

    const countryRes = await pool.query(`SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country IS NOT NULL GROUP BY country ORDER BY count DESC LIMIT 40`);
    console.log('\nTop 40 countries by OSM church count:');
    for (const row of countryRes.rows) {
      console.log(` ${row.country}: ${row.count}`);
    }

    // Check key countries that were under-imported
    const keyCountries = ['AT','HR','UA','RO','LV','BY','RS','BA','MK','AL','EE','GE','AM','RU','IN','JP','CA','US','MX','AR','CO','ID','CN'];
    const keyRes = await pool.query(`SELECT country, COUNT(*) as count FROM churches WHERE source = 'osm' AND country = ANY($1) GROUP BY country ORDER BY count DESC`, [keyCountries]);
    console.log('\nKey countries to check (were under-imported):');

    // Countries with no rows are absent from the GROUP BY result, so look
    // each one up and fall back to 0 to keep the full list in fixed order.
    const found = new Map<string, string | number>();
    for (const row of keyRes.rows) {
      found.set(row.country, row.count);
    }
    for (const c of keyCountries) {
      console.log(` ${c}: ${found.get(c) ?? 0}`);
    }

    // Total countries
    const countriesRes = await pool.query(`SELECT COUNT(DISTINCT country) as total FROM churches WHERE source = 'osm'`);
    console.log(`\nTotal countries with OSM data: ${countriesRes.rows[0].total}`);
  } finally {
    await pool.end();
  }
}
|
||||
main();
|
||||
88
scripts/debug/check-production-db.ts
Executable file
88
scripts/debug/check-production-db.ts
Executable file
@@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env tsx
|
||||
|
||||
/**
|
||||
* Check production database (Neon) for data
|
||||
* Run with: npx tsx scripts/debug/check-production-db.ts
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { config } from 'dotenv';
|
||||
|
||||
// Load environment variables (.env.local overrides .env)
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
const connectionString = process.env.DATABASE_URL;
|
||||
|
||||
if (!connectionString) {
|
||||
console.error('❌ DATABASE_URL not found in environment');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log('🔍 Checking production database...');
|
||||
console.log('📍 Connection:', connectionString.includes('neon.tech') ? 'Neon (Production)' : 'localhost');
|
||||
|
||||
const pool = new Pool({ connectionString });
|
||||
|
||||
/**
 * Runs a series of sanity checks against the database behind the
 * module-level `pool`: connectivity, the list of public tables, row counts
 * for `churches`, `mass_schedules` and `liturgical_days`, and whether a
 * `liturgical_days` row exists for today's (UTC) date.
 *
 * On failure the error is logged and the exit code is set to 1.
 * `process.exitCode` is used instead of `process.exit()` so that the
 * `finally` block still runs and the pool is closed.
 */
async function checkDatabase() {
  try {
    // Test connection
    console.log('\n1️⃣ Testing database connection...');
    await pool.query('SELECT NOW()');
    console.log('✅ Database connection successful');

    // Check tables exist
    console.log('\n2️⃣ Checking tables...');
    const tablesResult = await pool.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
      ORDER BY table_name
    `);
    const tableNames = tablesResult.rows.map((r) => r.table_name).join(', ');
    console.log(`✅ Found ${tablesResult.rows.length} tables:`, tableNames);

    // Check churches
    console.log('\n3️⃣ Checking churches...');
    const churchCount = await pool.query('SELECT COUNT(*) FROM "churches"');
    console.log(`📊 Churches: ${churchCount.rows[0].count}`);

    // Explicit radix: the count is parsed as a decimal string.
    if (parseInt(churchCount.rows[0].count, 10) > 0) {
      const sampleChurch = await pool.query('SELECT id, name, city, state, latitude, longitude FROM "churches" LIMIT 1');
      console.log('📍 Sample church:', sampleChurch.rows[0]);
    } else {
      console.log('⚠️ No churches found in database!');
    }

    // Check mass schedules
    console.log('\n4️⃣ Checking mass schedules...');
    const massCount = await pool.query('SELECT COUNT(*) FROM "mass_schedules"');
    console.log(`📊 Mass schedules: ${massCount.rows[0].count}`);

    // Check liturgical days
    console.log('\n5️⃣ Checking liturgical days...');
    const liturgicalCount = await pool.query('SELECT COUNT(*) FROM "liturgical_days"');
    console.log(`📊 Liturgical days: ${liturgicalCount.rows[0].count}`);

    // Check today's liturgical data. toISOString() is UTC, so "today" here
    // is the UTC calendar date (YYYY-MM-DD).
    const today = new Date().toISOString().split('T')[0];
    const todayData = await pool.query(
      'SELECT * FROM "liturgical_days" WHERE date = $1',
      [today]
    );
    if (todayData.rows.length > 0) {
      console.log(`✅ Today's liturgical data exists:`, todayData.rows[0].season);
    } else {
      console.log(`⚠️ No liturgical data for today (${today})`);
    }

    console.log('\n✨ Database check complete!\n');
  } catch (error) {
    console.error('❌ Error:', error);
    // Not process.exit(1): that would terminate before `finally` runs.
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
checkDatabase();
|
||||
164
scripts/debug/check-scraper-status.ts
Normal file
164
scripts/debug/check-scraper-status.ts
Normal file
@@ -0,0 +1,164 @@
|
||||
import { config } from 'dotenv';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
|
||||
// Load .env.local first, then .env
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
const connectionString = process.env.DATABASE_URL;
|
||||
|
||||
if (!connectionString) {
|
||||
throw new Error('DATABASE_URL environment variable is not set');
|
||||
}
|
||||
|
||||
const pool = new Pool({ connectionString });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Prints the state of mass-schedule scraping via the module-level Prisma
 * client: churches per data source, overall scraping coverage, the ten most
 * recently scraped churches, and how many churches have a URL but have never
 * been scraped.
 *
 * On failure the error is logged and the exit code is set to 1.
 * `process.exitCode` is used instead of `process.exit()` so that the
 * `finally` block still runs and both Prisma and the pg pool are shut down.
 */
async function checkScraperStatus() {
  const RULE = '═══════════════════════════════════════════════════════════════';

  // Share of `part` in `whole` with one decimal; an empty table yields
  // "0.0" instead of the "NaN" a bare division would print.
  const percent = (part: number, whole: number): string =>
    (whole > 0 ? (part / whole) * 100 : 0).toFixed(1);

  // A church counts as "having a website" when either URL field is set.
  const hasAnyUrl = [
    { website: { not: null } },
    { massScheduleUrl: { not: null } },
  ];

  try {
    console.log('Checking mass schedule scraper status...\n');

    // Overall church stats
    const totalChurches = await prisma.church.count();

    const churchesWithWebsites = await prisma.church.count({
      where: { OR: hasAnyUrl },
    });

    const churchesScraped = await prisma.church.count({
      where: { lastScrapedAt: { not: null } },
    });

    // Mass schedule stats
    const totalMassSchedules = await prisma.massSchedule.count();

    const churchesWithSchedules = await prisma.church.count({
      where: { massSchedules: { some: {} } },
    });

    // Recently scraped (last 7 days)
    const weekAgo = new Date();
    weekAgo.setDate(weekAgo.getDate() - 7);

    const recentlyScraped = await prisma.church.count({
      where: { lastScrapedAt: { gte: weekAgo } },
    });

    // Get scraper sources
    const bySource = await prisma.church.groupBy({
      by: ['source'],
      _count: { id: true },
    });

    console.log(RULE);
    console.log('CHURCH DATA SOURCES');
    console.log(RULE);
    for (const source of bySource) {
      console.log(`${source.source.padEnd(12)} | ${String(source._count.id).padStart(7)} churches (${percent(source._count.id, totalChurches)}%)`);
    }
    console.log('');

    console.log(RULE);
    console.log('MASS SCHEDULE SCRAPING STATUS');
    console.log(RULE);
    console.log(`Total churches: ${totalChurches.toLocaleString()}`);
    console.log(`Churches with websites: ${churchesWithWebsites.toLocaleString()} (${percent(churchesWithWebsites, totalChurches)}%)`);
    console.log(`Churches ever scraped: ${churchesScraped.toLocaleString()} (${percent(churchesScraped, totalChurches)}%)`);
    console.log(`Churches with mass schedules: ${churchesWithSchedules.toLocaleString()} (${percent(churchesWithSchedules, totalChurches)}%)`);
    console.log(`Total mass schedules: ${totalMassSchedules.toLocaleString()}`);
    console.log('');
    console.log(`Scraped in last 7 days: ${recentlyScraped.toLocaleString()}`);
    console.log('');

    // Average schedules per church (skipped when no church has schedules)
    if (churchesWithSchedules > 0) {
      const avgSchedules = totalMassSchedules / churchesWithSchedules;
      console.log(`Average schedules per church: ${avgSchedules.toFixed(1)} masses/week`);
      console.log('');
    }

    // Get sample of recently scraped churches
    const recentSample = await prisma.church.findMany({
      where: { lastScrapedAt: { not: null } },
      select: {
        name: true,
        city: true,
        state: true,
        country: true,
        lastScrapedAt: true,
        website: true,
        source: true,
        _count: { select: { massSchedules: true } },
      },
      orderBy: { lastScrapedAt: 'desc' },
      take: 10,
    });

    console.log(RULE);
    console.log('RECENTLY SCRAPED CHURCHES (Last 10)');
    console.log(RULE);
    if (recentSample.length === 0) {
      console.log('No churches have been scraped yet.');
    } else {
      recentSample.forEach((church, index) => {
        // Missing location parts are dropped rather than printed as blanks.
        const location = [church.city, church.state, church.country].filter(Boolean).join(', ');
        console.log(`${index + 1}. ${church.name} (${location})`);
        console.log(` Source: ${church.source}`);
        console.log(` Website: ${church.website || 'None'}`);
        console.log(` Last scraped: ${church.lastScrapedAt?.toLocaleString() || 'Never'}`);
        console.log(` Mass schedules: ${church._count.massSchedules}`);
        console.log('');
      });
    }

    // Churches ready to scrape (have website, not scraped)
    const readyToScrape = await prisma.church.count({
      where: {
        OR: hasAnyUrl,
        lastScrapedAt: null,
      },
    });

    console.log(RULE);
    console.log('SCRAPING POTENTIAL');
    console.log(RULE);
    console.log(`Churches ready to scrape: ${readyToScrape.toLocaleString()}`);
    console.log(` (have website, never scraped)`);
    console.log('');
  } catch (error) {
    console.error('Error:', error);
    // Not process.exit(1): that would terminate before `finally` runs.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
checkScraperStatus();
|
||||
47
scripts/debug/compare-schemas.ts
Normal file
47
scripts/debug/compare-schemas.ts
Normal file
@@ -0,0 +1,47 @@
|
||||
import { Pool } from 'pg';
|
||||
|
||||
/**
 * Looks up the columns of `table` in `information_schema.columns`.
 *
 * @param pool - pg pool connected to the database to inspect.
 * @param table - table name, bound as a query parameter.
 * @returns rows of `{ column_name, data_type }` in ordinal position order.
 */
async function getColumns(pool: Pool, table: string) {
  const sql = `SELECT column_name, data_type FROM information_schema.columns WHERE table_name = $1 ORDER BY ordinal_position`;
  const { rows } = await pool.query(sql, [table]);
  return rows;
}
|
||||
|
||||
/**
 * Compares the column sets of four tables between the NAS database and the
 * Neon database and prints, per table, which columns exist on only one side
 * (or that the schemas match).
 *
 * Connection strings come from the environment:
 *  - `NAS_DATABASE_URL`  (optional; defaults to the NAS address on the LAN)
 *  - `NEON_DATABASE_URL` (required)
 *
 * The Neon connection string is deliberately not embedded in this file: it
 * contains a production credential. NOTE(review): the credential that was
 * previously committed here should be rotated.
 *
 * Both pools are closed in a `finally` block so a failing query does not
 * leave connections open.
 */
async function run() {
  const nasUrl =
    process.env.NAS_DATABASE_URL ?? 'postgresql://postgres:postgres@192.168.0.145:5434/nearestmass';
  const neonUrl = process.env.NEON_DATABASE_URL;

  if (!neonUrl) {
    console.error('NEON_DATABASE_URL environment variable is not set');
    process.exitCode = 1;
    return;
  }

  const nas = new Pool({ connectionString: nasUrl });
  const neon = new Pool({
    connectionString: neonUrl,
    ssl: { rejectUnauthorized: false },
  });

  try {
    for (const table of ['churches', 'mass_schedules', 'confession_schedules', 'adoration_schedules']) {
      // The two lookups hit different databases, so run them together.
      const [nasCols, neonCols] = await Promise.all([
        getColumns(nas, table),
        getColumns(neon, table),
      ]);

      const nasNames = new Set(nasCols.map((c) => c.column_name));
      const neonNames = new Set(neonCols.map((c) => c.column_name));

      const onlyNas = nasCols.filter((c) => !neonNames.has(c.column_name));
      const onlyNeon = neonCols.filter((c) => !nasNames.has(c.column_name));

      if (onlyNas.length > 0 || onlyNeon.length > 0) {
        console.log(`\n=== ${table} ===`);
        if (onlyNas.length) {
          console.log(' NAS only:');
          for (const c of onlyNas) console.log(` - ${c.column_name} (${c.data_type})`);
        }
        if (onlyNeon.length) {
          console.log(' Neon only:');
          for (const c of onlyNeon) console.log(` - ${c.column_name} (${c.data_type})`);
        }
      } else {
        console.log(`\n=== ${table} === (schemas match)`);
      }
    }
  } finally {
    await nas.end();
    await neon.end();
  }
}
|
||||
|
||||
run();
|
||||
48
scripts/debug/data-overview.ts
Normal file
48
scripts/debug/data-overview.ts
Normal file
@@ -0,0 +1,48 @@
|
||||
import { Pool } from 'pg';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/**
 * Prints an overview of the database through one client checked out from the
 * module-level `pool`: headline counts, top countries, church sources, the
 * latest scrape timestamp, scrape-job statuses and schedules by language.
 *
 * The client is released and the pool ended once every section has printed.
 */
async function main() {
  const client = await pool.connect();

  // Formats a count column as a locale-grouped number.
  const fmt = (value: unknown): string => Number(value).toLocaleString();

  // Runs a single-row count query and returns its `count` column.
  const firstCount = async (sql: string) => (await client.query(sql)).rows[0].count;

  // Runs a grouped query, then prints a heading and one line per row.
  const printBreakdown = async (
    heading: string,
    sql: string,
    labelOf: (row: Record<string, string | null>) => string,
  ) => {
    const { rows } = await client.query(sql);
    console.log(heading);
    for (const row of rows) console.log(labelOf(row), fmt(row.cnt));
  };

  const churchTotal = await firstCount('SELECT count(*) FROM "Church"');
  console.log('\n=== DATABASE OVERVIEW ===');
  console.log('Churches total:', fmt(churchTotal));

  const headlineCounts: ReadonlyArray<[string, string]> = [
    ['With website:', 'SELECT count(*) FROM "Church" WHERE website IS NOT NULL'],
    ['With mass schedules:', 'SELECT count(DISTINCT "churchId") FROM "MassSchedule"'],
    ['Google Places enriched:', 'SELECT count(*) FROM "Church" WHERE "googlePlaceId" IS NOT NULL'],
    ['Total mass schedules:', 'SELECT count(*) FROM "MassSchedule"'],
  ];
  for (const [label, sql] of headlineCounts) {
    console.log(label, fmt(await firstCount(sql)));
  }

  await printBreakdown(
    '\n=== TOP COUNTRIES ===',
    'SELECT country, count(*) as cnt FROM "Church" GROUP BY country ORDER BY cnt DESC LIMIT 15',
    (row) => ' ' + (row.country || '(null)') + ':',
  );

  await printBreakdown(
    '\n=== CHURCH SOURCES ===',
    'SELECT source, count(*) as cnt FROM "Church" GROUP BY source ORDER BY cnt DESC LIMIT 10',
    (row) => ' ' + (row.source || '(null)') + ':',
  );

  const latest = await client.query('SELECT "lastScrapedAt" FROM "Church" WHERE "lastScrapedAt" IS NOT NULL ORDER BY "lastScrapedAt" DESC LIMIT 1');
  console.log('\n=== LAST SCRAPE ===');
  console.log(latest.rows[0]?.lastScrapedAt || 'No scrapes yet');

  await printBreakdown(
    '\n=== JOB STATUS ===',
    'SELECT status, count(*) as cnt FROM "ScrapeJob" GROUP BY status ORDER BY cnt DESC',
    // Unlike the other sections, status is printed without a null fallback.
    (row) => ' ' + row.status + ':',
  );

  await printBreakdown(
    '\n=== SCHEDULES BY LANGUAGE ===',
    'SELECT language, count(*) as cnt FROM "MassSchedule" GROUP BY language ORDER BY cnt DESC LIMIT 10',
    (row) => ' ' + (row.language || '(null)') + ':',
  );

  client.release();
  await pool.end();
}
|
||||
|
||||
main().catch(e => { console.error(e.message); process.exit(1); });
|
||||
58
scripts/debug/debug-french-page.ts
Normal file
58
scripts/debug/debug-french-page.ts
Normal file
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug a specific French page to see why scraping failed
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes a single French parish site with GenericScraper and prints
 * diagnostics: the scrape outcome, a sample of the tag-stripped page text,
 * which French day names occur in it, and a sample of time-like tokens.
 */
async function debugPage() {
  const url = 'https://www.chemin-neuf.fr/'; // Last failed church
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  // try/finally so scraper.close() still runs when scrape() throws.
  try {
    scraper.setCountry('FR');

    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules found: ${result.schedules.length}`);
    if (result.error) console.log(`Error: ${result.error}`);

    if (result.rawHtml) {
      // Drop <script>/<style> bodies, strip the remaining tags, collapse
      // whitespace and lowercase, so the checks below see plain page text.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      console.log('\n=== Page Text Sample (first 2000 chars) ===');
      console.log(text.substring(0, 2000));
      console.log('\n');

      // Check for French day names
      const frenchDays = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'];
      console.log('=== French day names found ===');
      for (const day of frenchDays) {
        if (text.includes(day)) {
          console.log(`✓ Found: ${day}`);
        }
      }

      // Check for time patterns
      console.log('\n=== Time patterns (sample) ===');
      // `text` is already lowercased, so the suffix alternatives must be
      // lowercase too; the former `AM|PM|Uhr` alternatives could never match.
      const timeRegex = /\d{1,2}[h:.]\s*\d{0,2}\s*(?:am|pm|uhr|uur|h)?/g;
      const times = text.match(timeRegex);
      if (times) {
        console.log(`Found ${times.length} time-like patterns:`);
        console.log(times.slice(0, 20).join(', '));
      } else {
        console.log('No time patterns found');
      }
    }
  } finally {
    await scraper.close();
  }
}

// Report the failure and make the process exit non-zero so callers notice.
debugPage().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
65
scripts/debug/debug-german-duplicates.ts
Normal file
65
scripts/debug/debug-german-duplicates.ts
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug why German church has duplicate schedules
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
// Temporarily patch GenericScraper to log sections
|
||||
// Keep a handle on the real parser so the logging wrapper can delegate to it.
const originalParse = GenericScraper.prototype['parseSchedules'];

/**
 * Logging wrapper installed over GenericScraper's parseSchedules.
 *
 * It reduces the HTML to plain lowercase text, prints every section reported
 * by findScheduleSections, then runs the original parser and prints the parsed
 * times grouped by day of week. The original parser's result is returned
 * unchanged.
 */
GenericScraper.prototype['parseSchedules'] = function(html: string) {
  const weekdayLabels = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];

  // Strip script/style bodies and all tags, collapse whitespace, lowercase.
  const withoutScripts = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
  const withoutStyles = withoutScripts.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  const plainText = withoutStyles
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();

  // Ask the scraper which sections it detects and list them.
  const sections = this['findScheduleSections'](plainText);

  console.log('\n=== Sections found ===\n');
  for (let idx = 0; idx < sections.length; idx++) {
    const section: any = sections[idx];
    console.log(`Section ${idx + 1}: ${weekdayLabels[section.day]} (day ${section.day})`);
    console.log(`  Text preview: "${section.text.substring(0, 100)}..."`);
  }
  console.log(`\nTotal sections: ${sections.length}\n`);

  // Delegate to the untouched parser for the real result.
  const parsed = originalParse.call(this, html);

  console.log(`\n=== Extracted times per section ===\n`);
  const grouped: Record<number, typeof parsed> = {};
  for (const entry of parsed) {
    const bucket = grouped[entry.dayOfWeek];
    if (bucket) {
      bucket.push(entry);
    } else {
      grouped[entry.dayOfWeek] = [entry];
    }
  }

  // Print days in Sunday..Saturday order, skipping days with no schedules.
  weekdayLabels.forEach((label, day) => {
    const entries = grouped[day];
    if (!entries) return;
    console.log(`${label}: ${entries.map(s => s.time).join(', ')}`);
  });

  return parsed;
};
|
||||
|
||||
/**
 * Scrapes the German test site with the (patched) GenericScraper and prints
 * the overall outcome. Section-level output comes from the parseSchedules
 * wrapper installed earlier in this file.
 */
async function testGerman() {
  const url = 'https://www.alterpeter.de/';
  console.log(`Testing: ${url}`);

  const scraper = new GenericScraper();
  await scraper.init();

  // try/finally so scraper.close() still runs when scrape() throws.
  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape(url);

    console.log(`\n=== Final Result ===`);
    console.log(`Success: ${result.success}`);
    console.log(`Total schedules: ${result.schedules.length}`);
  } finally {
    await scraper.close();
  }
}

// Report the failure and make the process exit non-zero so callers notice.
testGerman().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
44
scripts/debug/debug-masstimes.ts
Normal file
44
scripts/debug/debug-masstimes.ts
Normal file
@@ -0,0 +1,44 @@
|
||||
import { chromium } from 'playwright';
|
||||
|
||||
/**
 * Loads a masstimes.org search page in headless Chromium, saves a full-page
 * screenshot to /tmp/masstimes-debug.png, prints the first 5000 characters of
 * the rendered HTML and up to 50 medium-length visible text snippets.
 */
async function main() {
  const browser = await chromium.launch({ headless: true });

  // try/finally so the browser is closed even when navigation, the
  // screenshot or the in-page evaluation throws.
  try {
    const page = await browser.newPage();

    const url = 'https://masstimes.org/search?lat=32.7765&lng=-79.9311&type=parish';
    console.log('Loading:', url);

    await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });

    // Wait for Angular to render
    await page.waitForTimeout(5000);

    // Take screenshot
    await page.screenshot({ path: '/tmp/masstimes-debug.png', fullPage: true });
    console.log('Screenshot saved to /tmp/masstimes-debug.png');

    // Get page HTML
    const html = await page.content();
    console.log('\n--- PAGE HTML (first 5000 chars) ---\n');
    console.log(html.substring(0, 5000));

    // Try to find any visible text that looks like church names:
    // walk every text node and keep trimmed strings of 11..99 characters.
    const visibleText = await page.evaluate(() => {
      const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
      const texts: string[] = [];
      let node: Node | null;
      while ((node = walker.nextNode())) {
        const text = node.textContent?.trim();
        if (text && text.length > 10 && text.length < 100) {
          texts.push(text);
        }
      }
      return texts.slice(0, 50);
    });

    console.log('\n--- VISIBLE TEXT SNIPPETS ---\n');
    visibleText.forEach((t, i) => console.log(`${i + 1}. ${t}`));
  } finally {
    await browser.close();
  }
}

// Report the failure and make the process exit non-zero so callers notice.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
74
scripts/debug/debug-paroquia-paz.ts
Normal file
74
scripts/debug/debug-paroquia-paz.ts
Normal file
@@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Deep dive into Paróquia da Paz parsing bug
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes the Paróquia da Paz site and prints, from the tag-stripped page
 * text: the context around the first occurrence of each Portuguese day name,
 * the "16h"/"16h30"-style time tokens, and the text following common
 * schedule keywords.
 */
async function debugPaz() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  // try/finally so scraper.close() still runs when scrape() throws.
  try {
    scraper.setCountry('BR');

    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      // Drop <script>/<style> bodies, strip tags, collapse whitespace, lowercase.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find where days appear
      console.log('=== Finding day + time patterns ===\n');

      const days = ['domingo', 'segunda', 'terça', 'terca', 'quarta', 'quinta', 'sexta', 'sábado', 'sabado'];

      for (const day of days) {
        const dayIndex = text.indexOf(day);
        if (dayIndex !== -1) {
          // Show context around the day (100 chars before and 200 after)
          const before = Math.max(0, dayIndex - 100);
          const after = Math.min(text.length, dayIndex + 200);
          const snippet = text.substring(before, after);

          console.log(`${day.toUpperCase()}:`);
          console.log(`  Position: ${dayIndex}`);
          console.log(`  Context: ...${snippet}...`);
          console.log('');
        }
      }

      // Check for "h" time format specifically
      console.log('\n=== Checking "h" time format ===');
      const hTimeRegex = /(\d{1,2})h(\d{2})?/g;
      const hTimes = text.match(hTimeRegex);
      if (hTimes) {
        console.log(`Found ${hTimes.length} "h" format times:`);
        console.log(hTimes.slice(0, 30).join(', '));
      }

      // Look for schedule structure
      console.log('\n=== Looking for schedule structure ===');
      const scheduleKeywords = ['horário', 'horario', 'missa', 'missas', 'santa missa'];
      for (const keyword of scheduleKeywords) {
        const index = text.indexOf(keyword);
        if (index !== -1) {
          const snippet = text.substring(index, Math.min(text.length, index + 500));
          console.log(`\nFound "${keyword}" at position ${index}:`);
          console.log(snippet.substring(0, 300));
        }
      }
    }
  } finally {
    await scraper.close();
  }
}

// Report the failure and make the process exit non-zero so callers notice.
debugPaz().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
150
scripts/debug/debug-parsing-bugs.ts
Normal file
150
scripts/debug/debug-parsing-bugs.ts
Normal file
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug the 5 parsing bugs identified in top 5 test
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
// node-postgres pool; the connection string is read from DATABASE_URL.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });

// Prisma client that talks to the database through the pool above.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

/** One church to investigate: display name, country code and name search term. */
type BugChurch = { name: string; country: string; searchTerm: string };

// The churches with parsing bugs
const BUG_CHURCHES: BugChurch[] = [
  {
    name: 'St. Marien',
    country: 'DE',
    searchTerm: 'St. Marien',
  },
  {
    name: 'Santuario de Manalagua',
    country: 'ES',
    searchTerm: 'Santuario de Manalagua',
  },
  {
    name: 'Kościół pw. Najświętszego Serca',
    country: 'PL',
    searchTerm: 'Najświętszego Serca Pana Jez',
  },
  {
    name: 'Paróquia de Nossa Senhora do Desterro',
    country: 'BR',
    searchTerm: 'Nossa Senhora do Desterro',
  },
  {
    name: 'Paróquia da Paz',
    country: 'BR',
    searchTerm: 'Paróquia da Paz',
  },
];
|
||||
|
||||
/**
 * Reports whether the first extracted time token occurs earlier in `text`
 * than the first detected day name.
 *
 * @param text - Text that both tokens were extracted from.
 * @param firstDay - First day name found, or undefined when none was found.
 * @param firstTime - First time token found, or undefined when none was found.
 * @returns true only when both tokens are given, both occur in `text`, and
 *   the time token starts before the day name.
 */
function timesPrecedeDays(
  text: string,
  firstDay: string | undefined,
  firstTime: string | undefined,
): boolean {
  if (!firstDay || !firstTime) return false;
  const dayPos = text.indexOf(firstDay);
  const timePos = text.indexOf(firstTime);
  return dayPos !== -1 && timePos !== -1 && timePos < dayPos;
}

/**
 * For each entry in BUG_CHURCHES: looks the church up via Prisma, scrapes its
 * website with GenericScraper and prints diagnostics about the page (text
 * sample, day names, time tokens, schedule keywords and a few edge-case
 * heuristics). A failure on one church is logged and the loop continues.
 * The scraper, Prisma client and pg pool are always released at the end.
 */
async function debugBugs() {
  console.log('Debugging parsing bugs...\n');

  // Lowercase day names to look for, keyed by country code.
  const dayPatterns: Record<string, string[]> = {
    DE: ['sonntag', 'montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag'],
    ES: ['domingo', 'lunes', 'martes', 'miércoles', 'miercoles', 'jueves', 'viernes', 'sábado', 'sabado'],
    PL: ['niedziela', 'poniedziałek', 'poniedzialek', 'wtorek', 'środa', 'sroda', 'czwartek', 'piątek', 'piatek', 'sobota'],
    BR: ['domingo', 'segunda', 'terça', 'terca', 'quarta', 'quinta', 'sexta', 'sábado', 'sabado'],
  };

  // Lowercase mass-schedule keywords to look for, keyed by country code.
  const keywords: Record<string, string[]> = {
    DE: ['gottesdienst', 'messe', 'heilige messe', 'messzeiten'],
    ES: ['misa', 'horario', 'eucaristía', 'eucaristia'],
    PL: ['msza', 'msze', 'nabożeństwo', 'nabozenstwo'],
    BR: ['missa', 'horário', 'horario', 'eucaristia'],
  };

  try {
    const scraper = new GenericScraper();
    await scraper.init();

    try {
      for (const bug of BUG_CHURCHES) {
        console.log('═'.repeat(80));
        console.log(`BUG: ${bug.name} (${bug.country})`);
        console.log('═'.repeat(80));

        const church = await prisma.church.findFirst({
          where: {
            country: bug.country,
            name: { contains: bug.searchTerm },
            website: { not: null },
          },
        });

        // The query already filters on a non-null website; checking it here
        // narrows the type without a non-null assertion.
        if (!church || !church.website) {
          console.log(`❌ Church not found in database\n`);
          continue;
        }

        console.log(`Church: ${church.name}`);
        console.log(`URL: ${church.website}\n`);

        scraper.setCountry(bug.country);

        try {
          const result = await scraper.scrape(church.website);

          console.log(`Success: ${result.success}`);
          console.log(`Schedules found: ${result.schedules.length}`);
          if (result.error) console.log(`Error: ${result.error}`);

          if (result.rawHtml) {
            // Drop <script>/<style> bodies, strip tags, collapse whitespace, lowercase.
            const text = result.rawHtml
              .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
              .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
              .replace(/<[^>]+>/g, ' ')
              .replace(/\s+/g, ' ')
              .toLowerCase();

            console.log('\n--- Text Sample (first 1000 chars) ---');
            console.log(text.substring(0, 1000));

            // Check for day names
            console.log('\n--- Day Names Found ---');
            const days = dayPatterns[bug.country] || [];
            const foundDays = days.filter((day) => text.includes(day));
            console.log(`Found: ${foundDays.join(', ') || 'none'}`);

            // Check for time patterns
            console.log('\n--- Time Patterns Found ---');
            const timeRegex = /\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h|uhr)?/gi;
            const times = text.match(timeRegex);
            if (times) {
              const uniqueTimes = [...new Set(times)].slice(0, 20);
              console.log(`Found ${times.length} time patterns (showing first 20 unique):`);
              console.log(uniqueTimes.join(', '));
            } else {
              console.log('No time patterns found');
            }

            // Look for specific mass schedule keywords
            console.log('\n--- Mass Schedule Keywords ---');
            const countryKeywords = keywords[bug.country] || [];
            const foundKeywords = countryKeywords.filter((keyword) => text.includes(keyword));
            console.log(`Found: ${foundKeywords.join(', ') || 'none'}`);

            // Look for specific problematic patterns
            console.log('\n--- Looking for edge cases ---');

            // Check if times and days are separated (not in same section).
            // Only meaningful when both a day and a time were found; the
            // previous sentinel-string comparison reported YES whenever no
            // time token existed at all.
            const hasTimeBeforeDays = timesPrecedeDays(text, foundDays[0], times?.[0]);
            console.log(`Times come before days: ${hasTimeBeforeDays ? 'YES (potential issue)' : 'no'}`);

            // Check for table structures. colspan/rowspan are tag attributes,
            // so they must be looked up in the raw HTML: `text` has every tag
            // removed. The pipe heuristic still runs on the plain text.
            const hasSpanAttributes = /\b(?:colspan|rowspan)\b/i.test(result.rawHtml);
            const pipeSeparators = text.match(/\s+\|\s+/g)?.length ?? 0;
            const hasTables = hasSpanAttributes || pipeSeparators > 5;
            console.log(`Likely table format: ${hasTables ? 'YES (may need special handling)' : 'no'}`);

            // Check for multiple languages on same page
            const languageMentions =
              text.match(/english|español|espanol|portuguese|português|portugues|deutsch|polski/gi)?.length ?? 0;
            const hasMultiLang = languageMentions > 1;
            console.log(`Multiple languages: ${hasMultiLang ? 'YES (may confuse parser)' : 'no'}`);
          }

          console.log('\n');
        } catch (err: unknown) {
          // Best effort per church: log and move on to the next one.
          const message = err instanceof Error ? err.message : String(err);
          console.log(`❌ ERROR: ${message}\n`);
        }
      }
    } finally {
      await scraper.close();
    }
  } finally {
    // Release database resources even when setup or the loop throws.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
// Report the failure and make the process exit non-zero so callers notice.
debugBugs().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
98
scripts/debug/debug-paz-full-flow.ts
Normal file
98
scripts/debug/debug-paz-full-flow.ts
Normal file
@@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug the full parsing flow with section detection
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
/**
 * Scrapes the Paróquia da Paz site, locates the "segundas, terças" schedule
 * snippet in the tag-stripped text, probes it with the per-day section regex
 * for "sábados", "sabados" and "domingos", and finally prints how many parsed
 * schedules landed on each day of the week.
 */
async function debugFullFlow() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  // A single finally replaces the close() calls that used to be repeated on
  // each early return, and also covers the case where scrape() throws.
  try {
    scraper.setCountry('BR');

    const result = await scraper.scrape(url);

    if (!result.rawHtml) {
      console.log('No HTML received');
      return;
    }

    // Drop <script>/<style> bodies, strip tags, collapse whitespace, lowercase.
    const text = result.rawHtml
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .toLowerCase();

    // Find the schedule section
    const scheduleIndex = text.indexOf('segundas, terças');
    if (scheduleIndex === -1) {
      console.log('Schedule text not found!');
      return;
    }

    const snippet = text.substring(scheduleIndex, scheduleIndex + 500);
    console.log('Schedule snippet from actual HTML:');
    console.log(snippet);
    console.log('\n');

    // Now test section matching on actual text
    const dayConfigs = getDayNamesForCountry('BR');
    const dayPatterns = buildDayPatterns(dayConfigs);
    // Longest names first so longer alternatives win in the alternation.
    const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
    const allDayNamesPattern = sortedDayNames.map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');

    console.log('=== Testing sábados and domingos matches ===\n');

    // Same probe for each spelling: the day name, then ":"/whitespace, then
    // lazily everything up to the next known day name or the end of the text.
    for (const dayName of ['sábados', 'sabados', 'domingos']) {
      const sectionRegex = new RegExp(
        `(?:^|\\s|[,;:])${dayName}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
        'i'
      );
      const sectionMatch = snippet.match(sectionRegex);
      console.log(
        `${dayName} match:`,
        sectionMatch ? `Found: "${sectionMatch[1].substring(0, 50)}"` : 'Not found'
      );
    }

    console.log('\n=== Final parsed schedules ===\n');
    console.log(`Total: ${result.schedules.length}`);

    // Group the parsed schedules by day-of-week index.
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    const dayNames = ['Domingo', 'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado'];
    for (let i = 0; i < 7; i++) {
      if (byDay[i]) {
        console.log(`${dayNames[i]}: ${byDay[i].length} schedules`);
      } else {
        console.log(`${dayNames[i]}: 0 schedules ❌`);
      }
    }
  } finally {
    await scraper.close();
  }
}

// Report the failure and make the process exit non-zero so callers notice.
debugFullFlow().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
56
scripts/debug/debug-paz-sections.ts
Normal file
56
scripts/debug/debug-paz-sections.ts
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug which sections are being found
|
||||
*/
|
||||
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
// Simulate the exact text from the page
|
||||
const scheduleText = `
horário das missas igreja matriz de santo antônio
segundas, terças, quartas e sextas-feiras: 16h e 18h.
quintas-feiras: 16h e 19h (adoração ao santíssimo – 18h).
sábados: 8h, 16h e 18h.
domingos: 8h, 11h, 16h, 18h e 20h.
`.toLowerCase();

console.log('Text to parse:');
console.log(scheduleText);
console.log('');

// Escapes regex metacharacters so a day name can be embedded in a pattern.
const escapeForRegex = (raw: string) => raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

const dayConfigs = getDayNamesForCountry('BR');
const dayPatterns = buildDayPatterns(dayConfigs);

// Alternation of every known day name, longest first.
const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
const allDayNamesPattern = sortedDayNames.map(escapeForRegex).join('|');

console.log('=== COMMA-SEPARATED GROUP MATCHING ===\n');

// Two or more day names joined by commas/whitespace and an optional
// conjunction, then ":"/whitespace, then lazily the text up to the next day
// name or the end of the input.
const dayGroupRegex = new RegExp(
  `((?:${allDayNamesPattern})(?:[,\\s]+(?:e|and|et|und|y)?\\s*(?:${allDayNamesPattern}))+)[:\\s]+([^]*?)(?=(?:${allDayNamesPattern})|$)`,
  'gi'
);

let matchCount = 0;
for (const groupMatch of scheduleText.matchAll(dayGroupRegex)) {
  matchCount += 1;
  const [, dayGroup, timeText] = groupMatch;
  console.log(`Match #${matchCount}:`);
  console.log(`  Day group: "${dayGroup}"`);
  console.log(`  Time text: "${timeText}"`);
  console.log('');
}

console.log('=== INDIVIDUAL DAY MATCHING ===\n');

// Probe each known day name on its own and show the text captured after it.
Object.entries(dayPatterns).forEach(([dayName, dayIndex]) => {
  const singleDayRegex = new RegExp(
    `(?:^|\\s|[,;:])${escapeForRegex(dayName)}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );
  const found = singleDayRegex.exec(scheduleText);
  if (found === null) return;
  console.log(`Found ${dayName} (day ${dayIndex}):`);
  console.log(`  Time text: "${found[1].substring(0, 100)}"`);
});
|
||||
85
scripts/debug/debug-paz-with-logging.ts
Normal file
85
scripts/debug/debug-paz-with-logging.ts
Normal file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug Paróquia da Paz with added logging
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
/**
 * Scrapes the Paróquia da Paz site and runs the comma-separated day-group
 * regex over the tag-stripped page text, printing every match. When nothing
 * matches, it prints the "segundas, terças" snippet (if present) and which of
 * the ten longest known day names occur in it.
 */
async function debugPazWithLogging() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  // try/finally so scraper.close() still runs when scrape() throws.
  try {
    scraper.setCountry('BR');

    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      // Drop <script>/<style> bodies, strip tags, collapse whitespace, lowercase.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Test the regex pattern manually
      console.log('=== Testing comma-separated day grouping regex ===\n');

      const dayConfigs = getDayNamesForCountry('BR');
      const dayPatterns = buildDayPatterns(dayConfigs);
      // Longest names first so longer alternatives win in the alternation.
      const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);
      const allDayNamesPattern = sortedDayNames.map(d => d.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');

      console.log('Day patterns:', Object.keys(dayPatterns).join(', '));
      console.log('');

      // The exact regex from the code
      const dayGroupRegex = new RegExp(
        `((?:${allDayNamesPattern})(?:[,\\s]+(?:e|and|et|und|y)?\\s*(?:${allDayNamesPattern}))+)[:\\s]+([^]*?)(?=(?:${allDayNamesPattern})|$)`,
        'gi'
      );

      console.log('Regex pattern:', dayGroupRegex.source.substring(0, 200) + '...\n');

      let groupMatch;
      let matchCount = 0;
      while ((groupMatch = dayGroupRegex.exec(text)) !== null) {
        matchCount++;
        console.log(`Match #${matchCount}:`);
        console.log(`  Full match: "${groupMatch[0].substring(0, 100)}"`);
        console.log(`  Day group: "${groupMatch[1]}"`);
        console.log(`  Time text: "${groupMatch[2].substring(0, 50)}"`);
        console.log('');
      }

      if (matchCount === 0) {
        console.log('No matches found!\n');

        // Try to find the schedule text manually
        const scheduleIndex = text.indexOf('segundas, terças');
        if (scheduleIndex !== -1) {
          const snippet = text.substring(scheduleIndex, scheduleIndex + 300);
          console.log('Found schedule text at position', scheduleIndex);
          console.log('Snippet:', snippet);
          console.log('');

          // Test if individual day names are matching
          console.log('Testing individual day name matches in snippet:');
          for (const dayName of sortedDayNames.slice(0, 10)) {
            if (snippet.includes(dayName)) {
              console.log(`  ✓ Found: ${dayName}`);
            }
          }
        }
      }
    }
  } finally {
    await scraper.close();
  }
}

// Report the failure and make the process exit non-zero so callers notice.
debugPazWithLogging().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
85
scripts/debug/debug-polish-church.ts
Normal file
85
scripts/debug/debug-polish-church.ts
Normal file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug Polish church in detail
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
/**
 * Returns the position of the first needle (in the given order) that occurs
 * in `text`, or -1 when none of them occurs.
 */
function indexOfFirst(text: string, needles: readonly string[]): number {
  for (const needle of needles) {
    const at = text.indexOf(needle);
    if (at !== -1) return at;
  }
  return -1;
}

/**
 * Scrapes the Polish test parish site and prints the "msze święte" schedule
 * snippet from the tag-stripped page text, the space-separated and
 * colon-separated time tokens in that snippet, the Polish day names found in
 * it, and finally the parsed schedule times grouped by day of week.
 */
async function debugPolish() {
  const url = 'http://parafialubojna.pl';
  console.log(`Debugging: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  // try/finally so scraper.close() still runs when scrape() throws.
  try {
    scraper.setCountry('PL');

    const result = await scraper.scrape(url);

    console.log(`Success: ${result.success}`);
    console.log(`Schedules found: ${result.schedules.length}\n`);

    if (result.rawHtml) {
      // Drop <script>/<style> bodies, strip tags, collapse whitespace, lowercase.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find the schedule section. `indexOf(a) || indexOf(b)` cannot be used
      // here: a miss returns -1, which is truthy, so the unaccented fallback
      // would never be tried.
      const scheduleIndex = indexOfFirst(text, ['msze święte', 'msze swiete']);
      if (scheduleIndex !== -1) {
        const snippet = text.substring(scheduleIndex, scheduleIndex + 500);
        console.log('Schedule section:');
        console.log(snippet);
        console.log('\n');

        // Test all time pattern matches
        console.log('=== Testing time pattern matches ===\n');

        // Space separator pattern
        const spacePattern = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;
        const spaceMatches = snippet.match(spacePattern);
        console.log('Space-separated times (8 00, 9 30):');
        console.log(spaceMatches ? spaceMatches.join(', ') : 'none');
        console.log('');

        // Colon pattern
        const colonPattern = /\d{1,2}:\d{2}/g;
        const colonMatches = snippet.match(colonPattern);
        console.log('Colon times (8:00, 9:30):');
        console.log(colonMatches ? colonMatches.join(', ') : 'none');
        console.log('');

        // Polish day names
        console.log('=== Polish day names in snippet ===\n');
        const dayConfigs = getDayNamesForCountry('PL');
        const dayPatterns = buildDayPatterns(dayConfigs);

        for (const [dayName, dayNum] of Object.entries(dayPatterns)) {
          if (snippet.includes(dayName)) {
            console.log(`Found: ${dayName} (day ${dayNum})`);
          }
        }
      }
    }

    console.log('\n=== Parsed schedules ===\n');
    // Group the parsed schedules by day-of-week index.
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    const dayNames = ['Niedziela', 'Poniedziałek', 'Wtorek', 'Środa', 'Czwartek', 'Piątek', 'Sobota'];
    for (let i = 0; i < 7; i++) {
      if (byDay[i]) {
        console.log(`${dayNames[i]}: ${byDay[i].map(s => s.time).join(', ')}`);
      }
    }
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
// Report the failure and make the process exit non-zero so callers notice.
debugPolish().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
79
scripts/debug/debug-polish-sunday-monday.ts
Normal file
79
scripts/debug/debug-polish-sunday-monday.ts
Normal file
@@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Debug why Sunday and Monday aren't parsing for Polish church
|
||||
*/
|
||||
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
// Exact schedule text from website
|
||||
const text = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();

console.log('Text to parse:');
console.log(text);
console.log('\n');

const dayConfigs = getDayNamesForCountry('PL');
const dayPatterns = buildDayPatterns(dayConfigs);
// Longest names first so longer alternatives win in the alternation.
const sortedDayNames = Object.keys(dayPatterns).sort((a, b) => b.length - a.length);

/** Escapes regex metacharacters so a day name can be embedded in a pattern. */
function escapeRegExp(raw: string): string {
  return raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

const allDayNamesPattern = sortedDayNames.map(escapeRegExp).join('|');

/**
 * Runs the current per-day section regex for `dayName` against `text` and
 * prints whether it matched, the matched and captured text (first 100 chars),
 * and any space-separated times ("8 00") found in the capture.
 */
function reportDaySection(dayName: string): void {
  // Day name, then either up to 50 non-colon characters ending in ":" or
  // plain whitespace, then lazily everything up to the next known day name
  // or the end of the text.
  const regex = new RegExp(
    `(?:^|\\s|[,;:])${escapeRegExp(dayName)}(?:(?:[^:]{1,50})?:|\\s+)([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );

  const match = text.match(regex);
  if (!match) {
    console.log(`✗ NOT matched`);
    return;
  }

  console.log(`✓ Matched!`);
  console.log(`  Full match: "${match[0].substring(0, 100)}"`);
  console.log(`  Captured text: "${match[1].substring(0, 100)}"`);
  console.log('');

  // Check if times can be extracted
  const spacePattern = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;
  const times = match[1].match(spacePattern);
  console.log(`  Times found: ${times ? times.join(', ') : 'none'}`);
}

console.log('=== Testing niedziela (Sunday) ===\n');
reportDaySection('niedziela');

console.log('\n=== Testing poniedziałek (Monday) ===\n');
reportDaySection('poniedziałek');

console.log('\n=== Analyzing why niedziela might fail ===\n');

// The issue might be "niedziela i uroczystości:" - the phrase is long
// Check if the lookahead is hitting "uroczystości" before getting to the times
const niedzielaIndex = text.indexOf('niedziela');
console.log(`niedziela position: ${niedzielaIndex}`);

if (niedzielaIndex === -1) {
  console.log('Next day name position: n/a (niedziela not found)');
} else {
  // Search after the end of "niedziela" so a name that starts at the same
  // position cannot be reported as the "next" day, and treat -1 (not 0) as
  // the not-found marker.
  const searchFrom = niedzielaIndex + 'niedziela'.length;
  const laterDayPositions = sortedDayNames
    .filter(d => d !== 'niedziela')
    .map(d => text.indexOf(d, searchFrom))
    .filter(i => i !== -1);

  if (laterDayPositions.length === 0) {
    // Math.min() of an empty list is Infinity; report the absence instead.
    console.log('Next day name position: none found');
    console.log(`Text between: "${text.substring(niedzielaIndex)}"`);
  } else {
    const nextDayIndex = Math.min(...laterDayPositions);
    console.log(`Next day name position: ${nextDayIndex}`);
    console.log(`Text between: "${text.substring(niedzielaIndex, nextDayIndex)}"`);
  }
}
|
||||
44
scripts/debug/debug-thursday-context.ts
Normal file
44
scripts/debug/debug-thursday-context.ts
Normal file
@@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes the German test site and prints every "montag ... bis ...
 * donnerstag" occurrence in the tag-stripped page text, with 150 characters
 * of context before the match and 250 characters starting at the match.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();

  // try/finally so scraper.close() still runs when scrape() throws.
  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (result.rawHtml) {
      // Drop <script>/<style> bodies, strip tags, collapse whitespace, lowercase.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find "montag bis donnerstag" pattern
      const pattern = /montag[^]*?bis[^]*?donnerstag/gi;
      const matches = [...text.matchAll(pattern)];

      console.log(`Found ${matches.length} instances of "montag bis donnerstag":\n`);

      matches.forEach((match, i) => {
        const matchIndex = match.index ?? 0;
        const contextBefore = text.substring(Math.max(0, matchIndex - 150), matchIndex);
        const contextAfter = text.substring(matchIndex, Math.min(text.length, matchIndex + 250));

        console.log(`=== Instance ${i + 1} ===`);
        console.log(`Position: ${matchIndex}`);
        console.log(`\nContext BEFORE (150 chars):`);
        console.log(`"${contextBefore}"`);
        console.log(`\nContext AFTER (250 chars):`);
        console.log(`"${contextAfter}"`);
        console.log('');
      });
    }
  } finally {
    await scraper.close();
  }
}

// Report the failure and make the process exit non-zero so callers notice.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
45
scripts/debug/debug-zero-time.ts
Normal file
45
scripts/debug/debug-zero-time.ts
Normal file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Debug helper: scrapes https://www.alterpeter.de/ with the German locale and
 * lists every occurrence of "00 uhr" in the page text, with 50 characters of
 * context before and 100 characters from the match onwards.
 *
 * The scraper is always closed, even when scraping throws.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (result.rawHtml) {
      // Reduce the HTML to lower-cased, whitespace-normalised plain text.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find all instances of "00 uhr" pattern
      const pattern = /\b00\s*uhr/g;
      let count = 0;

      console.log('Looking for "00 uhr" patterns:\n');

      for (const match of text.matchAll(pattern)) {
        count++;
        const matchIndex = match.index ?? 0;
        const contextBefore = text.substring(Math.max(0, matchIndex - 50), matchIndex);
        const contextAfter = text.substring(matchIndex, Math.min(text.length, matchIndex + 100));

        console.log(`=== Occurrence ${count} at position ${matchIndex} ===`);
        console.log(`BEFORE: "...${contextBefore}"`);
        console.log(`MATCH + AFTER: "${contextAfter}..."`);
        console.log('');
      }

      console.log(`Total "00 uhr" occurrences: ${count}`);
    }
  } finally {
    // Release the scraper even if scrape() or the analysis above throws.
    await scraper.close();
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
37
scripts/debug/export-de-from-neon.ts
Normal file
37
scripts/debug/export-de-from-neon.ts
Normal file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { config } from 'dotenv';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import fs from 'fs/promises';
|
||||
|
||||
config({ path: '.env.local' });
|
||||
|
||||
/**
 * Exports every church with country 'DE', together with its mass, confession
 * and adoration schedules, from the database at DATABASE_URL into
 * `export-DE.json` in the current working directory.
 *
 * @throws Error when DATABASE_URL is not set, or when the query / file write fails.
 */
async function main() {
  console.log('📦 Exporting Germany from Neon...');

  const connectionString = process.env.DATABASE_URL;
  if (!connectionString) {
    // Fail early with a clear message instead of an opaque connection error.
    throw new Error('DATABASE_URL environment variable is not set');
  }

  const pool = new Pool({ connectionString });
  const adapter = new PrismaPg(pool);
  const prisma = new PrismaClient({ adapter });

  try {
    await prisma.$connect();

    const churches = await prisma.churches.findMany({
      where: { country: 'DE' },
      include: {
        massSchedules: true,
        confessionSchedules: true,
        adorationSchedules: true,
      },
    });

    console.log(`Found ${churches.length} churches in Germany`);

    await fs.writeFile('export-DE.json', JSON.stringify(churches, null, 2));
    console.log(`✅ Exported to export-DE.json`);
  } finally {
    // Release the client and the pg pool on both the success and failure paths.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
60
scripts/debug/export-from-nas.ts
Normal file
60
scripts/debug/export-from-nas.ts
Normal file
@@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Export churches from NAS database to JSON
|
||||
* Run this ON THE NAS (uses DATABASE_URL from .env)
|
||||
*/
|
||||
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import fs from 'fs/promises';
|
||||
|
||||
/**
 * Dumps all churches of one country (first CLI argument, default 'PL'),
 * including their mass, confession and adoration schedules, to
 * `export-<country>.json` and prints a summary plus an scp download hint.
 *
 * On failure the error is logged, the client is disconnected and the process
 * exits with status 1.
 */
async function main() {
  const country = process.argv[2] || 'PL';

  console.log(`📦 Exporting ${country} data from database...`);
  // The password part of the connection string is masked before logging.
  console.log(`DATABASE_URL: ${process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@')}`);

  const prisma = new PrismaClient();

  try {
    await prisma.$connect();
    console.log('✅ Connected to database');

    // Export churches with all schedules
    const churches = await prisma.churches.findMany({
      where: { country },
      include: {
        massSchedules: true,
        confessionSchedules: true,
        adorationSchedules: true,
      },
    });

    console.log(`Found ${churches.length} churches in ${country}`);

    // Tally the schedules of every kind in a single pass.
    let massSchedules = 0;
    let confessionSchedules = 0;
    let adorationSchedules = 0;
    for (const entry of churches) {
      massSchedules += entry.massSchedules?.length || 0;
      confessionSchedules += entry.confessionSchedules?.length || 0;
      adorationSchedules += entry.adorationSchedules?.length || 0;
    }

    // Save to file
    const exportFile = `export-${country}.json`;
    await fs.writeFile(exportFile, JSON.stringify(churches, null, 2));

    const summary = [
      `\n✅ Exported to ${exportFile}`,
      ` - ${churches.length} churches`,
      ` - ${massSchedules} mass schedules`,
      ` - ${confessionSchedules} confession schedules`,
      ` - ${adorationSchedules} adoration schedules`,
      `\nDownload with:`,
      ` scp albert@192.168.0.145:/volume1/docker/nearestmass/${exportFile} .`,
    ];
    for (const line of summary) {
      console.log(line);
    }

    await prisma.$disconnect();
  } catch (error) {
    console.error('❌ Export failed:', error);
    await prisma.$disconnect();
    process.exit(1);
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
230
scripts/debug/export-import-to-neon.ts
Normal file
230
scripts/debug/export-import-to-neon.ts
Normal file
@@ -0,0 +1,230 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Export churches from local NAS database and import to Neon
|
||||
*/
|
||||
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
|
||||
/** Row counts produced by one export run: churches and each schedule kind. */
type ExportStats = Record<
  'churches' | 'massSchedules' | 'confessionSchedules' | 'adorationSchedules',
  number
>;
|
||||
|
||||
/**
 * Exports all churches of `country` (with mass, confession and adoration
 * schedules) from the NAS database into `export-<country>.json` in the current
 * working directory.
 *
 * DATABASE_URL is pointed at the NAS only for the duration of the export and
 * is ALWAYS put back afterwards, including the case where it was originally
 * unset. Previously an unset variable stayed on the NAS URL, so the following
 * import step would have connected to the NAS instead of Neon.
 *
 * @param country Country code used in the `where` filter and in the file name.
 * @returns Counts of exported churches and schedules.
 * @throws Re-throws any connection, query or file-system error after logging it.
 */
async function exportFromNAS(country: string): Promise<ExportStats> {
  console.log(`📦 Exporting ${country} data from NAS...`);

  // Set DATABASE_URL to NAS. NAS_DATABASE_URL, when present, overrides the
  // built-in default so credentials need not live in source.
  // NOTE(review): the project docs mention the NAS PostgreSQL on port 5434,
  // while this default uses 5432 — confirm which one is current.
  const originalUrl = process.env.DATABASE_URL;
  process.env.DATABASE_URL =
    process.env.NAS_DATABASE_URL ??
    'postgresql://postgres:postgres@192.168.0.145:5432/nearestmass';

  const nasPrisma = new PrismaClient();

  try {
    await nasPrisma.$connect();
    console.log('✅ Connected to NAS database');

    // Export churches with all schedules
    const churches = await nasPrisma.churches.findMany({
      where: { country },
      include: {
        massSchedules: true,
        confessionSchedules: true,
        adorationSchedules: true,
      },
    });

    console.log(`Found ${churches.length} churches in ${country}`);

    // Count schedules
    const stats: ExportStats = {
      churches: churches.length,
      massSchedules: churches.reduce((sum, c) => sum + (c.massSchedules?.length || 0), 0),
      confessionSchedules: churches.reduce((sum, c) => sum + (c.confessionSchedules?.length || 0), 0),
      adorationSchedules: churches.reduce((sum, c) => sum + (c.adorationSchedules?.length || 0), 0),
    };

    // Save to file
    const exportFile = path.join(process.cwd(), `export-${country}.json`);
    await fs.writeFile(exportFile, JSON.stringify(churches, null, 2));

    console.log(`✅ Exported to ${exportFile}`);
    console.log(` - ${stats.churches} churches`);
    console.log(` - ${stats.massSchedules} mass schedules`);
    console.log(` - ${stats.confessionSchedules} confession schedules`);
    console.log(` - ${stats.adorationSchedules} adoration schedules`);

    return stats;
  } catch (error) {
    console.error('❌ Export failed:', error);
    throw error;
  } finally {
    await nasPrisma.$disconnect();

    // Restore the caller's DATABASE_URL exactly, removing it if it was unset.
    if (originalUrl === undefined) {
      delete process.env.DATABASE_URL;
    } else {
      process.env.DATABASE_URL = originalUrl;
    }
  }
}
|
||||
|
||||
/**
 * Imports `export-<country>.json` (from the current working directory) into
 * the database selected by DATABASE_URL.
 *
 * Each church is upserted on its (latitude, longitude) unique key. Whether a
 * row counts as inserted or updated is decided by looking the key up BEFORE
 * the upsert; the exported `createdAt` is copied along with the row, so it
 * cannot be used to tell new rows from old ones. For an already existing
 * church its schedules are deleted before the exported ones are inserted, so
 * re-running the import does not accumulate duplicate schedule rows.
 *
 * A failure on one church is logged and counted, and the loop continues.
 *
 * @param country Country code; selects the export file name.
 * @param dryRun  When true (default) nothing is written; churches are only counted.
 * @throws Re-throws file, parse or connection errors after logging them.
 */
async function importToNeon(country: string, dryRun: boolean = true): Promise<void> {
  console.log(`\n📤 Importing ${country} data to Neon...`);
  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No data will be written');
  }

  // Read export file
  const exportFile = path.join(process.cwd(), `export-${country}.json`);
  const data = JSON.parse(await fs.readFile(exportFile, 'utf-8'));
  console.log(`Loaded ${data.length} churches from export file`);

  // Connect to Neon
  const neonPrisma = new PrismaClient();

  try {
    await neonPrisma.$connect();
    console.log('✅ Connected to Neon database');

    let inserted = 0;
    let updated = 0;
    let errors = 0;

    for (const church of data) {
      try {
        // Split relations and the source primary key away from the scalar columns.
        const {
          id: _sourceId,
          massSchedules: exportedMass,
          confessionSchedules: exportedConfession,
          adorationSchedules: exportedAdoration,
          ...churchData
        } = church;
        const massSchedules = exportedMass || [];
        const confessionSchedules = exportedConfession || [];
        const adorationSchedules = exportedAdoration || [];

        if (!dryRun) {
          const coordinates = {
            latitude: churchData.latitude,
            longitude: churchData.longitude,
          };

          // Look up the key first: this is the only reliable insert/update signal.
          const existing = await neonPrisma.churches.findUnique({
            where: { latitude_longitude: coordinates },
            select: { id: true },
          });

          // Upsert church based on coordinates
          const result = await neonPrisma.churches.upsert({
            where: { latitude_longitude: coordinates },
            create: churchData,
            update: churchData,
          });

          if (existing) {
            updated++;
            // Replace, rather than append to, the schedules of an existing church.
            await neonPrisma.massSchedules.deleteMany({ where: { churchId: result.id } });
            await neonPrisma.confessionSchedules.deleteMany({ where: { churchId: result.id } });
            await neonPrisma.adorationSchedules.deleteMany({ where: { churchId: result.id } });
          } else {
            inserted++;
          }

          // Insert schedules, dropping the source ids and re-pointing them at the target row.
          for (const { id: _id, ...schedule } of massSchedules) {
            await neonPrisma.massSchedules.create({
              data: { ...schedule, churchId: result.id },
            });
          }
          for (const { id: _id, ...schedule } of confessionSchedules) {
            await neonPrisma.confessionSchedules.create({
              data: { ...schedule, churchId: result.id },
            });
          }
          for (const { id: _id, ...schedule } of adorationSchedules) {
            await neonPrisma.adorationSchedules.create({
              data: { ...schedule, churchId: result.id },
            });
          }
        } else {
          // Dry run - just count
          inserted++;
        }

        // Report once per 100 successfully processed churches.
        const processed = inserted + updated;
        if (processed % 100 === 0) {
          console.log(`Progress: ${processed} churches processed...`);
        }
      } catch (error) {
        errors++;
        console.error(`Error importing church ${church.name}:`, error instanceof Error ? error.message : error);
      }
    }

    console.log('\n✅ Import complete!');
    console.log(` - ${inserted} churches inserted`);
    console.log(` - ${updated} churches updated`);
    console.log(` - ${errors} errors`);
  } catch (error) {
    console.error('❌ Import failed:', error);
    throw error;
  } finally {
    await neonPrisma.$disconnect();
  }
}
|
||||
|
||||
/**
 * CLI entry point: `<country> [dry-run|real-import]`.
 *
 * Exports the country (default 'PL') from the NAS, then imports the export
 * file into Neon. Any mode other than the literal 'dry-run' performs a real
 * import. Exits with status 1 when either step fails.
 */
async function main() {
  const country = process.argv[2] || 'PL';
  const mode = process.argv[3] || 'dry-run';
  const dryRun = mode === 'dry-run';

  console.log('🌍 Export/Import to Neon');
  console.log('========================\n');

  try {
    // Step 1: Export from NAS (the summary is printed by the export itself)
    await exportFromNAS(country);

    // Step 2: Import to Neon
    await importToNeon(country, dryRun);

    if (dryRun) {
      console.log('\n💡 This was a DRY RUN. To actually import to Neon, run:');
      // The script lives under scripts/debug/, so the hint must include that directory.
      console.log(` npx tsx scripts/debug/export-import-to-neon.ts ${country} real-import`);
    } else {
      console.log('\n🎉 Data successfully uploaded to Neon!');
    }
  } catch (error) {
    console.error('❌ Process failed:', error);
    process.exit(1);
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
41
scripts/debug/find-donnerstag-sections.ts
Normal file
41
scripts/debug/find-donnerstag-sections.ts
Normal file
@@ -0,0 +1,41 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Debug helper: scrapes https://www.alterpeter.de/ with the German locale and
 * prints every occurrence of "donnerstag" in the page text, with 100
 * characters of context before and 200 characters from the match onwards.
 *
 * The scraper is always closed, even when scraping throws.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (result.rawHtml) {
      // Reduce the HTML to lower-cased, whitespace-normalised plain text.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      // Find all instances of "donnerstag" (Thursday)
      let count = 0;
      for (
        let idx = text.indexOf('donnerstag');
        idx !== -1;
        idx = text.indexOf('donnerstag', idx + 1) // resume one past the previous hit
      ) {
        count++;
        const contextBefore = text.substring(Math.max(0, idx - 100), idx);
        const contextAfter = text.substring(idx, Math.min(text.length, idx + 200));

        console.log(`=== Donnerstag occurrence ${count} at position ${idx} ===`);
        console.log(`BEFORE: "...${contextBefore}"`);
        console.log(`AFTER: "${contextAfter}..."`);
        console.log('');
      }

      console.log(`Total "donnerstag" occurrences: ${count}`);
    }
  } finally {
    // Release the scraper even if scrape() or the analysis above throws.
    await scraper.close();
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
42
scripts/debug/find-office-hours-pattern.ts
Normal file
42
scripts/debug/find-office-hours-pattern.ts
Normal file
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env tsx
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Debug helper: scrapes https://www.alterpeter.de/ with the German locale and
 * prints the text surrounding "9.00 – 12.00". When that exact string is
 * absent, it prints the context of the first alternative spelling of
 * nine o'clock that is present.
 *
 * The scraper is always closed, even when scraping throws.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('DE');

    const result = await scraper.scrape('https://www.alterpeter.de/');

    if (result.rawHtml) {
      // Reduce the HTML to lower-cased, whitespace-normalised plain text.
      const text = result.rawHtml
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .toLowerCase();

      const idx = text.indexOf('9.00 – 12.00');
      if (idx !== -1) {
        console.log('Context around "9.00 – 12.00":');
        console.log(text.substring(Math.max(0, idx - 150), idx + 200));
      } else {
        console.log('Pattern "9.00 – 12.00" not found');

        // Try alternative patterns; only the first one present is reported.
        const patterns = ['9.00', '9:00', '09:00', '09.00'];
        for (const pattern of patterns) {
          const idx2 = text.indexOf(pattern);
          if (idx2 !== -1) {
            console.log(`\nFound "${pattern}" at position ${idx2}:`);
            console.log(text.substring(Math.max(0, idx2 - 100), idx2 + 150));
            break;
          }
        }
      }
    }
  } finally {
    // Release the scraper even if scrape() or the analysis above throws.
    await scraper.close();
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
102
scripts/debug/identify-top5-bugs.ts
Normal file
102
scripts/debug/identify-top5-bugs.ts
Normal file
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Identify which churches are flagged as "parsing bugs" in top 5 test
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/** Countries covered by the check: country code plus display name. */
const COUNTRY_PAIRS: Array<[string, string]> = [
  ['FR', 'France'],
  ['DE', 'Germany'],
  ['ES', 'Spain'],
  ['PL', 'Poland'],
  ['BR', 'Brazil'],
];

const COUNTRIES = COUNTRY_PAIRS.map(([code, name]) => ({ code, name }));
|
||||
|
||||
/**
 * Weekday names in English, French, German, Spanish, Polish and Portuguese.
 *
 * Unicode-aware lookarounds replace `\b`: JavaScript's `\b` only knows ASCII
 * word characters, so a name starting with a non-ASCII letter (e.g. "środa")
 * could never match after a space.
 */
const DAY_NAME_PATTERN =
  /(?<![\p{L}\p{N}_])(?:sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|domingos|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|sábados|sabados|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|segundas|terça|terca|terças|tercas|quarta|quartas|quinta|quintas|sexta|sextas)(?![\p{L}\p{N}_])/iu;

/**
 * A clock time: 1-2 digits followed either by a separator (`h`, `:` or `.`)
 * and two minute digits, or by an am/pm/h/uhr suffix. A bare number does not
 * count; the previous pattern matched any single digit.
 */
const TIME_PATTERN = /\b\d{1,2}(?:\s*[h:.]\s*\d{2}(?!\d)|\s*(?:am|pm|h|uhr)\b)/i;

/**
 * For each country in COUNTRIES, scrapes the 10 oldest OSM-sourced churches
 * that have a website and reports those where scraping was unsuccessful even
 * though the page text contains both a weekday name and a clock time.
 *
 * Per-church scrape errors are logged as warnings and skipped. The scraper,
 * the Prisma client and the pg pool are released even when the run fails.
 */
async function identifyBugs() {
  console.log('Identifying "parsing bugs" from top 5 test...\n');

  const scraper = new GenericScraper();
  await scraper.init();

  const bugs: Array<{
    country: string;
    church: string;
    url: string;
    hasDays: boolean;
    hasTimes: boolean;
  }> = [];

  try {
    for (const country of COUNTRIES) {
      const churches = await prisma.church.findMany({
        where: {
          country: country.code,
          website: { not: null },
          source: 'osm',
        },
        take: 10,
        orderBy: { createdAt: 'asc' },
      });

      scraper.setCountry(country.code);

      for (const church of churches) {
        const url = church.website;
        if (!url) {
          continue; // excluded by the query; keeps the type non-null below
        }

        try {
          const result = await scraper.scrape(url);

          if (!result.success && result.rawHtml) {
            // Reduce the HTML to lower-cased, whitespace-normalised plain text.
            const text = result.rawHtml
              .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
              .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
              .replace(/<[^>]+>/g, ' ')
              .replace(/\s+/g, ' ')
              .toLowerCase();

            // Check for day names and times
            const hasDays = DAY_NAME_PATTERN.test(text);
            const hasTimes = TIME_PATTERN.test(text);

            if (hasDays && hasTimes) {
              bugs.push({
                country: country.name,
                church: church.name,
                url,
                hasDays,
                hasTimes,
              });
            }
          }
        } catch (err: unknown) {
          // Best effort: one unreachable site must not abort the survey, but say so.
          const reason = err instanceof Error ? err.message : String(err);
          console.warn(`Skipping ${url}: ${reason}`);
        }
      }
    }

    console.log(`\n${'='.repeat(80)}`);
    console.log(`FOUND ${bugs.length} POTENTIAL PARSING BUGS\n`);

    bugs.forEach((bug, i) => {
      console.log(`${i + 1}. ${bug.church} (${bug.country})`);
      console.log(` URL: ${bug.url}`);
      console.log('');
    });
  } finally {
    await scraper.close();
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
identifyBugs().catch(console.error);
|
||||
232
scripts/debug/import-to-neon.ts
Normal file
232
scripts/debug/import-to-neon.ts
Normal file
@@ -0,0 +1,232 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Import churches from JSON export to Neon database
|
||||
* Run this LOCALLY (uses DATABASE_URL from .env pointing to Neon)
|
||||
*/
|
||||
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
|
||||
/**
 * One church record as found in an `export-<country>.json` file.
 *
 * `id` and the three schedule arrays are optional because the importer
 * removes them with `delete` before writing the remaining columns; under
 * strict type checking `delete` is only allowed on optional properties.
 * Any further exported columns are carried through the index signature.
 */
interface ChurchExport {
  /** Primary key in the source database; dropped before import. */
  id?: string;
  name: string;
  latitude: number;
  longitude: number;
  country: string;
  massSchedules?: any[];
  confessionSchedules?: any[];
  adorationSchedules?: any[];
  [key: string]: any;
}
|
||||
|
||||
/**
 * Inserts the exported schedule rows for one church. The source ids are
 * dropped and every row is attached to `churchId`.
 */
async function insertSchedules(
  prisma: PrismaClient,
  churchId: string,
  schedules: { mass: any[]; confession: any[]; adoration: any[] },
): Promise<void> {
  for (const { id: _id, ...schedule } of schedules.mass) {
    await prisma.massSchedules.create({ data: { ...schedule, churchId } });
  }
  for (const { id: _id, ...schedule } of schedules.confession) {
    await prisma.confessionSchedules.create({ data: { ...schedule, churchId } });
  }
  for (const { id: _id, ...schedule } of schedules.adoration) {
    await prisma.adorationSchedules.create({ data: { ...schedule, churchId } });
  }
}

/**
 * CLI entry point: `<country> [dry-run|real-import]`.
 *
 * Reads `export-<country>.json` from the current working directory (country
 * defaults to 'PL') and writes it to the database selected by DATABASE_URL.
 * A church is matched on (latitude, longitude): an existing church is updated
 * and its schedules replaced, an unknown one is created. In dry-run mode (the
 * default) nothing is written and every church is counted as an insert.
 *
 * A failure on one church is logged and counted; the loop continues. A missing
 * export file prints step-by-step instructions. Any fatal error ends the
 * process with status 1.
 */
async function main() {
  const country = process.argv[2] || 'PL';
  const mode = process.argv[3] || 'dry-run';
  const dryRun = mode === 'dry-run';

  console.log(`📤 Importing ${country} data to Neon...`);
  // The password part of the connection string is masked before logging.
  console.log(`DATABASE_URL: ${process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@')}`);

  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No data will be written');
  }

  // Read export file
  const exportFile = path.join(process.cwd(), `export-${country}.json`);

  let data: ChurchExport[];
  try {
    data = JSON.parse(await fs.readFile(exportFile, 'utf-8'));
  } catch (error) {
    if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {
      console.error(`❌ Export file not found: ${exportFile}`);
      console.error(`\nFirst, export data from NAS:`);
      console.error(` ssh albert@192.168.0.145`);
      console.error(` cd /volume1/docker/nearestmass`);
      console.error(` /usr/local/bin/docker compose --profile tools run --rm scraper npx tsx scripts/debug/export-from-nas.ts ${country}`);
      console.error(`\nThen download the export:`);
      console.error(` scp albert@192.168.0.145:/volume1/docker/nearestmass/export-${country}.json .`);
      console.error(`\nFinally, run this import script again.`);
    } else {
      console.error('❌ Process failed:', error);
    }
    process.exit(1);
  }
  console.log(`Loaded ${data.length} churches from export file`);

  // Connect to Neon
  const prisma = new PrismaClient();
  let failed = false;

  try {
    await prisma.$connect();
    console.log('✅ Connected to Neon database');

    let inserted = 0;
    let updated = 0;
    let errors = 0;
    let totalMassSchedules = 0;
    let totalConfessionSchedules = 0;
    let totalAdorationSchedules = 0;

    for (const church of data) {
      try {
        // Split relations and the source primary key away from the scalar columns.
        const {
          id: _sourceId,
          massSchedules,
          confessionSchedules,
          adorationSchedules,
          ...churchData
        } = church;
        const schedules = {
          mass: massSchedules || [],
          confession: confessionSchedules || [],
          adoration: adorationSchedules || [],
        };

        if (!dryRun) {
          // Check if church already exists
          const existing = await prisma.churches.findFirst({
            where: {
              latitude: church.latitude,
              longitude: church.longitude,
            },
          });

          if (existing) {
            // Update existing church
            await prisma.churches.update({
              where: { id: existing.id },
              data: churchData,
            });

            // Delete existing schedules so the exported ones replace them
            await prisma.massSchedules.deleteMany({ where: { churchId: existing.id } });
            await prisma.confessionSchedules.deleteMany({ where: { churchId: existing.id } });
            await prisma.adorationSchedules.deleteMany({ where: { churchId: existing.id } });

            await insertSchedules(prisma, existing.id, schedules);
            updated++;
          } else {
            // Create new church
            const result = await prisma.churches.create({ data: churchData });

            await insertSchedules(prisma, result.id, schedules);
            inserted++;
          }
        } else {
          // Dry run - just count
          inserted++;
        }

        totalMassSchedules += schedules.mass.length;
        totalConfessionSchedules += schedules.confession.length;
        totalAdorationSchedules += schedules.adoration.length;

        // Only reached after a successful church, so failures never repeat a progress line.
        if ((inserted + updated) % 100 === 0) {
          console.log(`Progress: ${inserted + updated} churches processed...`);
        }
      } catch (error) {
        errors++;
        console.error(`Error importing church ${church.name}:`, error instanceof Error ? error.message : error);
      }
    }

    console.log('\n✅ Import complete!');
    console.log(` - ${inserted} churches inserted`);
    console.log(` - ${updated} churches updated`);
    console.log(` - ${errors} errors`);
    console.log(` - ${totalMassSchedules} mass schedules`);
    console.log(` - ${totalConfessionSchedules} confession schedules`);
    console.log(` - ${totalAdorationSchedules} adoration schedules`);

    if (dryRun) {
      console.log('\n💡 This was a DRY RUN. To actually import to Neon, run:');
      // The script lives under scripts/debug/, so the hint must include that directory.
      console.log(` npx tsx scripts/debug/import-to-neon.ts ${country} real-import`);
    } else {
      console.log('\n🎉 Data successfully uploaded to Neon!');
    }
  } catch (error) {
    console.error('❌ Import failed:', error);
    failed = true;
  } finally {
    await prisma.$disconnect();
  }

  if (failed) {
    process.exit(1);
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
84
scripts/debug/investigate-8-bugs.ts
Normal file
84
scripts/debug/investigate-8-bugs.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Investigate the 8 potential parsing bugs
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/** A church page to re-scrape: display name, country code and URL. */
interface BugCandidate {
  name: string;
  country: string;
  url: string;
}

/** The eight pages under investigation, in report order. */
const BUGS: BugCandidate[] = [
  {
    name: 'Chapelle Saint-Jean-XXIII',
    country: 'FR',
    url: 'https://www.chemin-neuf.fr/',
  },
  {
    name: 'St. Marien',
    country: 'DE',
    url: 'https://www.willehad.de/start/',
  },
  {
    name: 'Iglesia de San Fernando',
    country: 'ES',
    url: 'https://www.parroquiasanfernandomaspalomas.net/de/',
  },
  {
    name: 'Monestir de Sant Esperit',
    country: 'ES',
    url: 'https://www.santoespiritu.org/',
  },
  {
    name: 'Santuario de Manalagua',
    country: 'ES',
    url: 'http://tierrasdeburgos.blogspot.com.es/2013/12/escultura-del-agua-santuario-de.html',
  },
  {
    name: 'Kościół pw. Najświętszego Serca',
    country: 'PL',
    url: 'http://parafialubojna.pl',
  },
  {
    name: 'Paróquia do Desterro',
    country: 'BR',
    url: 'https://paroquiaportodegalinhas.blogspot.com.br/',
  },
  {
    name: 'Catedral Diocesana',
    country: 'BR',
    url: 'http://diocesedejuazeiro.org.br/',
  },
];
|
||||
|
||||
/**
 * Re-scrapes every entry in BUGS and prints a short diagnosis: scrape result,
 * page-type warnings (blogspot / hotel / restaurant / error keywords), whether
 * schedule keywords are present, and a 200-character text sample.
 *
 * The sample starts 50 characters before the first of "mass", "messe", "misa",
 * "missa" that actually occurs in the text (checked in that order), or at the
 * start of the text when none does. `indexOf` returns -1 on a miss and -1 is
 * truthy, so chaining the lookups with `||` never reached the later keywords.
 *
 * A failure on one page is logged and the loop continues; the scraper is
 * always closed.
 */
async function investigate() {
  console.log('Investigating 8 potential bugs...\n');

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    for (let i = 0; i < BUGS.length; i++) {
      const bug = BUGS[i];
      console.log(`${'='.repeat(80)}`);
      console.log(`${i + 1}. ${bug.name} (${bug.country})`);
      console.log(` ${bug.url}`);
      console.log('='.repeat(80));

      scraper.setCountry(bug.country);

      try {
        const result = await scraper.scrape(bug.url);

        console.log(`Success: ${result.success}`);
        console.log(`Schedules: ${result.schedules.length}`);
        console.log(`Error: ${result.error || 'none'}`);

        if (!result.success && result.rawHtml) {
          // Reduce the HTML to lower-cased, whitespace-normalised plain text.
          const text = result.rawHtml
            .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
            .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
            .replace(/<[^>]+>/g, ' ')
            .replace(/\s+/g, ' ')
            .toLowerCase();

          // Check page type
          console.log('\nPage analysis:');
          if (text.includes('blogspot')) {
            console.log(' ⚠️ Blogspot page (likely blog post, not church website)');
          }
          if (text.includes('hotel') || text.includes('reservation') || text.includes('booking')) {
            console.log(' ⚠️ Contains hotel/booking keywords');
          }
          if (text.includes('restaurant') || text.includes('menu')) {
            console.log(' ⚠️ Contains restaurant keywords');
          }
          if (text.includes('404') || text.includes('not found') || text.includes('error')) {
            console.log(' ⚠️ Error/404 page');
          }

          // Check if it has schedule keywords
          const hasScheduleKeywords = text.match(/(mass|messe|misa|missa|horário|horario|gottesdienst|eucarist)/i);
          console.log(` Schedule keywords: ${hasScheduleKeywords ? '✓ Found' : '✗ Not found'}`);

          // Show sample text around the first keyword that is actually present.
          const massIndex =
            ['mass', 'messe', 'misa', 'missa']
              .map(keyword => text.indexOf(keyword))
              .find(index => index >= 0) ?? 0;
          const sampleStart = Math.max(0, massIndex - 50);
          const sample = text.substring(sampleStart, sampleStart + 300);
          console.log(`\n Sample text: "${sample.substring(0, 200)}..."`);
        }

        console.log('\n');
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        console.log(`ERROR: ${message}\n\n`);
      }
    }
  } finally {
    // Release the scraper even if something outside the per-page try throws.
    await scraper.close();
  }
}
|
||||
|
||||
investigate().catch(console.error);
|
||||
134
scripts/debug/list-church-websites.ts
Normal file
134
scripts/debug/list-church-websites.ts
Normal file
@@ -0,0 +1,134 @@
|
||||
import { config } from 'dotenv';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
|
||||
// Load .env.local first, then .env
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
const connectionString = process.env.DATABASE_URL;
|
||||
|
||||
if (!connectionString) {
|
||||
throw new Error('DATABASE_URL environment variable is not set');
|
||||
}
|
||||
|
||||
const pool = new Pool({ connectionString });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Writes `church-websites.md`, a Markdown report covering every church row:
 * summary counts, a per-country breakdown, the churches that have a website,
 * and the churches that do not. The headline counts are also echoed to stdout.
 *
 * On failure the error is logged and the process exit code is set to 1. The
 * Prisma client and the pg pool are released in all cases.
 */
async function listChurchWebsites() {
  // Percentage with one decimal place. Guards the empty-table case, which
  // would otherwise be rendered as "NaN%".
  const percent = (part: number, whole: number): string =>
    (whole === 0 ? 0 : (part / whole) * 100).toFixed(1);

  try {
    console.log('Fetching churches from database...\n');

    const churches = await prisma.church.findMany({
      select: {
        id: true,
        name: true,
        city: true,
        state: true,
        country: true,
        website: true,
        googlePlaceId: true,
      },
      orderBy: [
        { country: 'asc' },
        { state: 'asc' },
        { city: 'asc' },
      ],
    });
    type ChurchRow = (typeof churches)[number];

    // "City, State, Country" with missing parts dropped.
    const describeLocation = (church: ChurchRow): string =>
      [church.city, church.state, church.country].filter(Boolean).join(', ');

    console.log(`Total churches: ${churches.length}`);

    const withWebsite = churches.filter(c => c.website);
    const withGoogle = churches.filter(c => c.googlePlaceId);
    const withoutWebsite = churches.filter(c => !c.website);

    console.log(`Churches with website: ${withWebsite.length}`);
    console.log(`Churches with Google Place ID: ${withGoogle.length}`);
    console.log(`Churches without website: ${withoutWebsite.length}\n`);

    // Group by country; rows without a country are filed under "Unknown".
    const byCountry: Record<string, ChurchRow[]> = {};
    for (const church of churches) {
      const country = church.country || 'Unknown';
      if (!byCountry[country]) {
        byCountry[country] = [];
      }
      byCountry[country].push(church);
    }
    const countriesBySize = Object.entries(byCountry).sort(([, a], [, b]) => b.length - a.length);
    const countriesByName = Object.entries(byCountry).sort(([a], [b]) => a.localeCompare(b));

    // Summary section
    let output = '# Church Websites\n\n';
    output += `Generated: ${new Date().toISOString()}\n\n`;
    output += `## Summary\n`;
    output += `- Total churches: ${churches.length}\n`;
    output += `- With website: ${withWebsite.length} (${percent(withWebsite.length, churches.length)}%)\n`;
    output += `- With Google Place ID: ${withGoogle.length} (${percent(withGoogle.length, churches.length)}%)\n`;
    output += `- Without website: ${withoutWebsite.length} (${percent(withoutWebsite.length, churches.length)}%)\n\n`;

    // Country breakdown, largest country first
    output += `## By Country\n\n`;
    for (const [country, countryChurches] of countriesBySize) {
      const siteCount = countryChurches.filter(c => c.website).length;
      const googleCount = countryChurches.filter(c => c.googlePlaceId).length;
      output += `### ${country} (${countryChurches.length} churches)\n`;
      output += `- With website: ${siteCount} (${percent(siteCount, countryChurches.length)}%)\n`;
      output += `- With Google Place ID: ${googleCount} (${percent(googleCount, countryChurches.length)}%)\n\n`;
    }

    // Every church that has a website, countries in alphabetical order.
    // The country heading is emitted even when none of its churches has one.
    output += `## All Websites\n\n`;
    for (const [country, countryChurches] of countriesByName) {
      output += `### ${country}\n\n`;
      for (const church of countryChurches) {
        if (!church.website) continue;
        output += `- **${church.name}** (${describeLocation(church)})\n`;
        output += ` - Website: ${church.website}\n`;
        if (church.googlePlaceId) {
          output += ` - Google Place ID: ${church.googlePlaceId}\n`;
        }
        output += ` - DB ID: ${church.id}\n\n`;
      }
    }

    // Churches without a website; countries with none are omitted entirely.
    output += `## Churches Without Websites\n\n`;
    for (const [country, countryChurches] of countriesByName) {
      const without = countryChurches.filter(c => !c.website);
      if (without.length === 0) continue;
      output += `### ${country}\n\n`;
      for (const church of without) {
        output += `- **${church.name}** (${describeLocation(church)})\n`;
        if (church.googlePlaceId) {
          output += ` - Google Place ID: ${church.googlePlaceId}\n`;
        }
        output += ` - DB ID: ${church.id}\n\n`;
      }
    }

    // Write to file
    const fs = await import('fs/promises');
    await fs.writeFile('church-websites.md', output);
    console.log('✓ Written to church-websites.md');
  } catch (error) {
    console.error('Error:', error);
    // Set the exit code instead of calling process.exit(): exiting here would
    // terminate the process before the finally block releases connections.
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
listChurchWebsites();
|
||||
44
scripts/debug/list-tables.ts
Normal file
44
scripts/debug/list-tables.ts
Normal file
@@ -0,0 +1,44 @@
|
||||
import { Pool } from 'pg';
|
||||
import * as dotenv from 'dotenv';
|
||||
import * as path from 'path';
|
||||
|
||||
// Load .env.local first (takes precedence), then .env
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
const pool = new Pool({
|
||||
connectionString: process.env.DATABASE_URL,
|
||||
});
|
||||
|
||||
/**
 * Prints the name of every table in the `public` schema of the database
 * addressed by DATABASE_URL, followed by a total.
 *
 * The connection string is echoed with its password masked. A failed query is
 * logged and marks the process as failed (exit code 1); the pool is always
 * closed.
 */
async function listTables() {
  try {
    console.log('Connecting to database...');
    // Mask the ":password@" part of the URL before echoing it.
    console.log(
      'DATABASE_URL:',
      process.env.DATABASE_URL?.replace(/:[^:@]+@/, ':****@') ?? '(not set)'
    );

    // List all tables
    const result = await pool.query(`
      SELECT table_name
      FROM information_schema.tables
      WHERE table_schema = 'public'
      ORDER BY table_name;
    `);

    console.log('\n=== Tables in Database ===');
    if (result.rows.length === 0) {
      console.log('No tables found!');
    } else {
      for (const row of result.rows) {
        console.log(`- ${row.table_name}`);
      }
    }

    console.log(`\nTotal: ${result.rows.length} tables`);
  } catch (error) {
    console.error('Error listing tables:', error);
    // Without this the script reports success (exit 0) even when the query failed.
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
listTables();
|
||||
167
scripts/debug/pipeline-report.js
Normal file
167
scripts/debug/pipeline-report.js
Normal file
@@ -0,0 +1,167 @@
|
||||
const { Client } = require("pg");
|
||||
// Database client used by every report query below.
// DATABASE_URL (read from the real process environment; this script does not
// load .env files) takes precedence so credentials need not be edited in
// source. The previous hard-coded connection string is kept only as a
// fallback so existing invocations keep working.
const client = new Client({
  connectionString:
    process.env.DATABASE_URL ||
    "postgresql://postgres:postgres@192.168.0.145:5434/nearestmass"
});
|
||||
|
||||
const queries = [
|
||||
{
|
||||
name: "1. Overall church counts by country (top 20)",
|
||||
sql: `SELECT country, COUNT(*) as total,
|
||||
COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
|
||||
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped,
|
||||
COUNT(*) FILTER (WHERE has_website = true) as has_website_flag,
|
||||
COUNT(*) FILTER (WHERE website_language IS NOT NULL) as has_language
|
||||
FROM churches
|
||||
GROUP BY country
|
||||
ORDER BY total DESC
|
||||
LIMIT 20`
|
||||
},
|
||||
{
|
||||
name: "2. Total mass schedule counts",
|
||||
sql: `SELECT COUNT(*) as total_schedules,
|
||||
COUNT(DISTINCT church_id) as churches_with_schedules
|
||||
FROM mass_schedules`
|
||||
},
|
||||
{
|
||||
name: "3. Scrape results by language",
|
||||
sql: `SELECT website_language as language,
|
||||
COUNT(*) as total_scraped,
|
||||
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped
|
||||
FROM churches
|
||||
WHERE website_language IS NOT NULL
|
||||
GROUP BY website_language
|
||||
ORDER BY total_scraped DESC`
|
||||
},
|
||||
{
|
||||
name: "4. Churches with websites but never scraped",
|
||||
sql: `SELECT COUNT(*) as has_website_not_scraped
|
||||
FROM churches
|
||||
WHERE website IS NOT NULL AND last_scraped_at IS NULL`
|
||||
},
|
||||
{
|
||||
name: "5. Overall pipeline funnel",
|
||||
sql: `SELECT
|
||||
COUNT(*) as total_churches,
|
||||
COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
|
||||
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as attempted_scrape,
|
||||
COUNT(*) FILTER (WHERE website_language IS NOT NULL) as has_detected_language,
|
||||
(SELECT COUNT(DISTINCT church_id) FROM mass_schedules) as has_schedules_saved,
|
||||
(SELECT COUNT(*) FROM mass_schedules) as total_schedule_rows
|
||||
FROM churches`
|
||||
},
|
||||
{
|
||||
name: "6. Recent scrape activity (last 7 days) by language",
|
||||
sql: `SELECT website_language as language,
|
||||
COUNT(*) as scraped_last_7d
|
||||
FROM churches
|
||||
WHERE last_scraped_at > NOW() - INTERVAL '7 days'
|
||||
GROUP BY website_language
|
||||
ORDER BY scraped_last_7d DESC`
|
||||
},
|
||||
{
|
||||
name: "7. Background job history (last 15 completed/failed jobs)",
|
||||
sql: `SELECT type, language, status,
|
||||
created_at::date as created,
|
||||
completed_at::date as completed,
|
||||
ROUND(CAST(EXTRACT(EPOCH FROM (completed_at - created_at))/3600 AS numeric), 2) as hours,
|
||||
total_items, processed, succeeded, failed
|
||||
FROM background_jobs
|
||||
WHERE status IN ('completed', 'failed')
|
||||
ORDER BY completed_at DESC
|
||||
LIMIT 15`
|
||||
},
|
||||
{
|
||||
name: "8. Mass schedule breakdown by day of week",
|
||||
sql: `SELECT day_of_week,
|
||||
CASE day_of_week
|
||||
WHEN 0 THEN 'Sunday' WHEN 1 THEN 'Monday' WHEN 2 THEN 'Tuesday'
|
||||
WHEN 3 THEN 'Wednesday' WHEN 4 THEN 'Thursday' WHEN 5 THEN 'Friday'
|
||||
WHEN 6 THEN 'Saturday' ELSE 'Other'
|
||||
END as day_name,
|
||||
COUNT(*) as count
|
||||
FROM mass_schedules
|
||||
GROUP BY day_of_week
|
||||
ORDER BY day_of_week`
|
||||
},
|
||||
{
|
||||
name: "9. Churches with schedules by country (top 15)",
|
||||
sql: `SELECT c.country,
|
||||
COUNT(DISTINCT c.id) as total_churches,
|
||||
COUNT(DISTINCT ms.church_id) as churches_with_schedules,
|
||||
ROUND(100.0 * COUNT(DISTINCT ms.church_id) / NULLIF(COUNT(DISTINCT c.id), 0), 1) as coverage_pct,
|
||||
COUNT(ms.id) as total_schedule_rows
|
||||
FROM churches c
|
||||
LEFT JOIN mass_schedules ms ON ms.church_id = c.id
|
||||
GROUP BY c.country
|
||||
ORDER BY total_churches DESC
|
||||
LIMIT 15`
|
||||
},
|
||||
{
|
||||
name: "10. Enrichment sources - how churches were found",
|
||||
sql: `SELECT source, COUNT(*) as count
|
||||
FROM churches
|
||||
GROUP BY source
|
||||
ORDER BY count DESC`
|
||||
},
|
||||
{
|
||||
name: "11. Google Places enrichment impact",
|
||||
sql: `SELECT
|
||||
COUNT(*) FILTER (WHERE google_place_id IS NOT NULL) as has_google_place,
|
||||
COUNT(*) FILTER (WHERE google_place_id IS NOT NULL AND website IS NOT NULL) as google_with_website,
|
||||
COUNT(*) FILTER (WHERE google_place_id IS NULL) as no_google_place,
|
||||
COUNT(*) FILTER (WHERE google_searched_at IS NOT NULL) as google_searched,
|
||||
COUNT(*) FILTER (WHERE free_searched_at IS NOT NULL) as free_searched
|
||||
FROM churches`
|
||||
},
|
||||
{
|
||||
name: "12. Website presence by source",
|
||||
sql: `SELECT source,
|
||||
COUNT(*) as total,
|
||||
COUNT(*) FILTER (WHERE website IS NOT NULL) as has_website,
|
||||
ROUND(100.0 * COUNT(*) FILTER (WHERE website IS NOT NULL) / NULLIF(COUNT(*), 0), 1) as website_pct,
|
||||
COUNT(*) FILTER (WHERE google_place_id IS NOT NULL) as has_google_place,
|
||||
COUNT(*) FILTER (WHERE last_scraped_at IS NOT NULL) as scraped
|
||||
FROM churches
|
||||
GROUP BY source
|
||||
ORDER BY total DESC`
|
||||
}
|
||||
];
|
||||
|
||||
/**
 * Connects, executes each entry of `queries` in order and prints its result
 * as a fixed-width text table under a banner with the query name.
 * A failing query is reported inline and does not stop the remaining ones.
 */
async function run() {
  const banner = "=".repeat(90);

  // Text used for a single cell; null/undefined are shown as "NULL".
  const cellText = (row, column) => String(row[column] ?? "NULL");

  // Prints header, separator and one line per row. Expects at least one row.
  function printTable(rows) {
    const columns = Object.keys(rows[0]);
    // Each column is as wide as its longest cell or its header, whichever is larger.
    const widths = columns.map(column =>
      rows.reduce(
        (widest, row) => Math.max(widest, cellText(row, column).length),
        column.length
      )
    );

    const headerLine = columns.map((column, i) => column.padEnd(widths[i])).join(" | ");
    const separatorLine = widths.map(width => "-".repeat(width)).join("-+-");
    console.log(headerLine);
    console.log(separatorLine);

    rows.forEach(row => {
      const line = columns
        .map((column, i) => cellText(row, column).padEnd(widths[i]))
        .join(" | ");
      console.log(line);
    });
  }

  await client.connect();

  for (const query of queries) {
    console.log(banner);
    console.log(query.name);
    console.log(banner);

    try {
      const { rows } = await client.query(query.sql);
      if (rows.length > 0) {
        printTable(rows);
      } else {
        console.log("(no rows returned)");
      }
      console.log("\n(" + rows.length + " rows)\n");
    } catch (err) {
      console.log("ERROR:", err.message, "\n");
    }
  }

  await client.end();
}
|
||||
|
||||
run().catch(e => { console.error(e); process.exit(1); });
|
||||
48
scripts/debug/show-french-success.ts
Normal file
48
scripts/debug/show-french-success.ts
Normal file
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Show detailed output from a successful French parse
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes one known-good French church page and prints the parsed schedules
 * grouped by day of week (French day names, Sunday first).
 *
 * The scraper is closed even when scraping or printing throws, so a failed
 * run does not leave its resources open.
 */
async function showSuccess() {
  // One of our successful churches with 16 schedules
  const url = 'https://laportelatine.org/lieux/couvent-saint-francois-morgon';
  console.log(`Detailed parse of: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('FR');

    const result = await scraper.scrape(url);

    console.log(`✅ Success: ${result.success}`);
    console.log(`📅 Schedules found: ${result.schedules.length}\n`);

    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    const dayNames = ['Dimanche', 'Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi'];

    console.log('═══════════════════════════════════════════════');
    console.log('PARSED SCHEDULE:');
    console.log('═══════════════════════════════════════════════\n');

    const days = Object.entries(byDay).sort(([a], [b]) => Number(a) - Number(b));
    for (const [day, scheds] of days) {
      console.log(`${dayNames[Number(day)]}:`);
      for (const s of scheds) {
        // Same fallbacks as test-french-scraper.ts, so a missing value is not
        // printed as the literal text "undefined". `||` is deliberate: an
        // empty string should fall back too.
        console.log(` ${s.time} - ${s.language || 'Unknown'} ${s.massType || 'Mass'}`);
      }
      console.log('');
    }
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
showSuccess().catch(console.error);
|
||||
28
scripts/debug/test-db-connection.ts
Normal file
28
scripts/debug/test-db-connection.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test database connection
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
// Report whether DATABASE_URL is configured without leaking credentials:
// the ":password@" part is masked (same pattern as list-tables.ts) before the
// value is truncated, and an unset variable is reported explicitly instead of
// printing "undefined...".
const databaseUrl = process.env.DATABASE_URL;
console.log('DATABASE_URL exists:', !!databaseUrl);
console.log(
  'DATABASE_URL value:',
  databaseUrl
    ? databaseUrl.replace(/:[^:@]+@/, ':****@').substring(0, 50) + '...'
    : '(not set)'
);
|
||||
|
||||
import { prisma } from '../../src/lib/db';
|
||||
|
||||
/**
 * Verifies database connectivity by counting church rows through the shared
 * Prisma client and printing the result.
 *
 * A failure is printed and marks the process as failed (exit code 1), so the
 * script can be used as a check in shell pipelines. The client is always
 * disconnected.
 */
async function testConnection() {
  try {
    const count = await prisma.church.count();
    console.log(`✅ Database connection successful!`);
    console.log(`Total churches in database: ${count}`);
  } catch (err: unknown) {
    console.log(`❌ Database connection failed:`);
    console.log(err instanceof Error ? err.message : String(err));
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
  }
}
|
||||
|
||||
testConnection();
|
||||
180
scripts/debug/test-french-broader.ts
Normal file
180
scripts/debug/test-french-broader.ts
Normal file
@@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test more French churches and collect diagnostic data
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/** Outcome of one scrape attempt plus text heuristics about the fetched page. */
interface DiagnosticInfo {
  url: string;
  churchName: string;
  /** `success` flag reported by the scraper (false when the scrape threw). */
  success: boolean;
  schedulesFound: number;
  /** Page text contains at least one French weekday name. */
  hasFrenchDays: boolean;
  /** Page text contains at least one time-like token (e.g. "10h30", "9:00"). */
  hasTimePatterns: boolean;
  /** Up to 10 distinct time-like tokens found in the page text. */
  timePatternsSample: string[];
  /** First 500 characters of the lower-cased, tag-stripped page text. */
  textSample: string;
  error?: string;
}

/** The subset of DiagnosticInfo that is derived from the page HTML alone. */
type PageSignals = Pick<
  DiagnosticInfo,
  'hasFrenchDays' | 'hasTimePatterns' | 'timePatternsSample' | 'textSample'
>;

/**
 * Derives the day/time heuristics from raw HTML. Script and style elements
 * and all tags are stripped, whitespace is collapsed and the text is
 * lower-cased before matching. Missing HTML yields all-negative signals.
 */
function extractPageSignals(rawHtml?: string | null): PageSignals {
  if (!rawHtml) {
    return { hasFrenchDays: false, hasTimePatterns: false, timePatternsSample: [], textSample: '' };
  }

  const text = rawHtml
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();

  const frenchDays = ['dimanche', 'lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi'];
  const times = text.match(/\d{1,2}[h:\.]\s*\d{0,2}\s*(?:h)?/g);

  return {
    hasFrenchDays: frenchDays.some(day => text.includes(day)),
    hasTimePatterns: times !== null,
    timePatternsSample: times ? [...new Set(times)].slice(0, 10) : [],
    textSample: text.substring(0, 500),
  };
}

/**
 * Scrapes up to 20 French OSM-sourced churches that have a website, prints a
 * per-church result, and finishes with a failure analysis that buckets every
 * failed page by whether it contained French day names and/or time patterns.
 *
 * A page counts as a success only when the scraper reports success AND at
 * least one schedule was parsed; the failure analysis uses that same rule, so
 * its buckets add up to the reported failure count. The scraper, the Prisma
 * client and the pg pool are released even when an unexpected error occurs.
 */
async function testFrenchBroader() {
  const isFailure = (d: DiagnosticInfo): boolean => !(d.success && d.schedulesFound > 0);

  try {
    console.log('Testing 20 French churches with diagnostics...\n');

    // Get more French churches
    const churches = await prisma.church.findMany({
      where: {
        country: 'FR',
        website: { not: null },
        source: 'osm',
      },
      take: 20,
      orderBy: { createdAt: 'asc' },
    });

    if (churches.length === 0) {
      console.log('No French churches found.');
      return;
    }

    console.log(`Found ${churches.length} French churches to test\n`);

    let successCount = 0;
    let failCount = 0;
    const diagnostics: DiagnosticInfo[] = [];

    const scraper = new GenericScraper();
    await scraper.init();
    try {
      scraper.setCountry('FR');

      for (let i = 0; i < churches.length; i++) {
        const church = churches[i];
        // The query above filters on `website: { not: null }`.
        const url = church.website!;
        console.log(`[${i + 1}/${churches.length}] Testing: ${church.name} (${church.city || 'Unknown'})`);
        console.log(`URL: ${church.website}`);

        try {
          const result = await scraper.scrape(url);
          const signals = extractPageSignals(result.rawHtml);

          const diagnostic: DiagnosticInfo = {
            url,
            churchName: church.name,
            success: result.success,
            schedulesFound: result.schedules.length,
            ...signals,
            error: result.error,
          };
          diagnostics.push(diagnostic);

          if (!isFailure(diagnostic)) {
            successCount++;
            console.log(`✅ SUCCESS - ${result.schedules.length} schedules`);
          } else {
            failCount++;
            console.log(`❌ FAILED - ${result.error}`);
            if (signals.hasFrenchDays && !signals.hasTimePatterns) {
              console.log(` 💡 Has French days but no times`);
            } else if (!signals.hasFrenchDays && signals.hasTimePatterns) {
              console.log(` 💡 Has times but no French days`);
            } else if (signals.hasFrenchDays && signals.hasTimePatterns) {
              console.log(` 💡 Has BOTH days and times - parsing issue!`);
              console.log(` Sample times: ${signals.timePatternsSample.slice(0, 5).join(', ')}`);
            } else {
              console.log(` 💡 No mass schedule content found`);
            }
          }
          console.log('');
        } catch (err: unknown) {
          const message = err instanceof Error ? err.message : String(err);
          failCount++;
          console.log(`❌ ERROR - ${message}\n`);
          diagnostics.push({
            url,
            churchName: church.name,
            success: false,
            schedulesFound: 0,
            hasFrenchDays: false,
            hasTimePatterns: false,
            timePatternsSample: [],
            textSample: '',
            error: message,
          });
        }
      }
    } finally {
      await scraper.close();
    }

    // Analysis
    console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log(`\nRESULTS: ${successCount}/${churches.length} successful (${((successCount / churches.length) * 100).toFixed(0)}%)`);
    console.log(`Success: ${successCount}, Failed: ${failCount}\n`);

    // Bucket with the same failure rule used for the counters above; filtering
    // on `!d.success` alone would drop pages that "succeeded" with 0 schedules.
    const failures = diagnostics.filter(isFailure);
    const hasBoth = failures.filter(d => d.hasFrenchDays && d.hasTimePatterns);
    const hasDaysNoTimes = failures.filter(d => d.hasFrenchDays && !d.hasTimePatterns);
    const hasTimesNoDays = failures.filter(d => !d.hasFrenchDays && d.hasTimePatterns);
    const hasNeither = failures.filter(d => !d.hasFrenchDays && !d.hasTimePatterns);

    console.log('FAILURE ANALYSIS:');
    console.log(` Has days + times but failed: ${hasBoth.length} (PARSING BUG)`);
    console.log(` Has days but no times: ${hasDaysNoTimes.length}`);
    console.log(` Has times but no days: ${hasTimesNoDays.length}`);
    console.log(` Has neither: ${hasNeither.length} (no mass schedule on page)`);
    console.log('');

    if (hasBoth.length > 0) {
      console.log('⚠️ PARSING BUGS TO FIX (has both days and times but failed):');
      for (const d of hasBoth) {
        console.log(` ${d.churchName}`);
        console.log(` URL: ${d.url}`);
        console.log(` Sample times found: ${d.timePatternsSample.slice(0, 5).join(', ')}`);
        console.log(` Text sample: ${d.textSample.substring(0, 150)}...`);
        console.log('');
      }
    }
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
testFrenchBroader().catch(console.error);
|
||||
100
scripts/debug/test-french-scraper.ts
Executable file
100
scripts/debug/test-french-scraper.ts
Executable file
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test international scraper against French churches
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Scrapes the first 5 French OSM-sourced churches that have a website and
 * prints, per church, either the parsed schedules grouped by day or the
 * failure reason, followed by an overall success rate.
 *
 * A church counts as a success only when the scraper reports success and at
 * least one schedule was parsed. The scraper, the Prisma client and the pg
 * pool are released even when an unexpected error occurs.
 */
async function testFrenchScraper() {
  const divider = '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━';
  const dayNames = ['Dimanche', 'Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi'];

  try {
    console.log('Testing French church mass schedule scraping...\n');

    // Get French churches with websites
    const churches = await prisma.church.findMany({
      where: {
        country: 'FR',
        website: { not: null },
        source: 'osm',
      },
      take: 5,
      orderBy: { createdAt: 'asc' },
    });

    if (churches.length === 0) {
      console.log('No French churches with websites found.');
      return;
    }

    console.log(`Found ${churches.length} French churches to test:\n`);

    let successCount = 0;
    let failCount = 0;

    const scraper = new GenericScraper();
    await scraper.init();
    try {
      scraper.setCountry('FR');

      for (const church of churches) {
        console.log(divider);
        console.log(`Church: ${church.name}`);
        console.log(`City: ${church.city || 'Unknown'}`);
        console.log(`URL: ${church.website}`);
        console.log('');

        try {
          // The query above filters on `website: { not: null }`.
          const result = await scraper.scrape(church.website!);

          if (result.success && result.schedules.length > 0) {
            successCount++;
            console.log(`✅ SUCCESS - Found ${result.schedules.length} schedules\n`);

            // Group by day and show
            const byDay: Record<number, typeof result.schedules> = {};
            for (const sched of result.schedules) {
              if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
              byDay[sched.dayOfWeek].push(sched);
            }

            for (const [day, scheds] of Object.entries(byDay)) {
              console.log(` ${dayNames[Number(day)]}:`);
              for (const s of scheds) {
                console.log(` ${s.time} - ${s.language || 'Unknown'} (${s.massType || 'Mass'})`);
              }
            }
            console.log('');
          } else {
            failCount++;
            console.log(`❌ FAILED - ${result.error}`);
            console.log('');
          }
        } catch (err: unknown) {
          failCount++;
          console.log(`❌ ERROR - ${err instanceof Error ? err.message : String(err)}`);
          console.log('');
        }
      }
    } finally {
      await scraper.close();
    }

    console.log(divider);
    console.log(`\nRESULTS: ${successCount}/${churches.length} successful (${((successCount / churches.length) * 100).toFixed(0)}%)`);
    console.log(`Success: ${successCount}, Failed: ${failCount}\n`);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
testFrenchScraper().catch(console.error);
|
||||
210
scripts/debug/test-international-sample.ts
Normal file
210
scripts/debug/test-international-sample.ts
Normal file
@@ -0,0 +1,210 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test scraper on a diverse sample of international churches
|
||||
* to identify edge cases across different languages and formats
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/** One hand-picked church page used as an input for the international scraper test. */
interface TestChurch {
  /** Display name printed in the test output. */
  name: string;
  /** Page handed to the scraper. */
  url: string;
  /** Country code passed to the scraper via `setCountry` (e.g. 'FR', 'DE'). */
  country: string;
  /** Human-readable language label; only printed. */
  language: string;
  /**
   * Expected days, e.g. "Sun-Sat" or "Sun, Wed, Sat".
   * NOTE(review): not read by the test code visible here; confirm whether it is used.
   */
  expectedDays?: string;
  /** Free-form remark printed alongside the church when present. */
  notes?: string;
}
|
||||
|
||||
// Sample churches from different countries/languages
|
||||
const testChurches: TestChurch[] = [
|
||||
// FRENCH
|
||||
{
|
||||
name: 'Saint-Étienne du Mont, Paris',
|
||||
url: 'https://www.saintetiennedumontparis.fr/',
|
||||
country: 'FR',
|
||||
language: 'French',
|
||||
notes: 'French format with "du lundi au vendredi"',
|
||||
},
|
||||
{
|
||||
name: 'Notre-Dame de la Garde, Marseille',
|
||||
url: 'https://www.notredamedelagarde.fr/',
|
||||
country: 'FR',
|
||||
language: 'French',
|
||||
notes: 'Major pilgrimage site',
|
||||
},
|
||||
|
||||
// GERMAN
|
||||
{
|
||||
name: 'St. Peter, Munich',
|
||||
url: 'https://www.alterpeter.de/',
|
||||
country: 'DE',
|
||||
language: 'German',
|
||||
notes: 'German format with "bis" for ranges',
|
||||
},
|
||||
{
|
||||
name: 'Kölner Dom, Cologne',
|
||||
url: 'https://www.koelner-dom.de/',
|
||||
country: 'DE',
|
||||
language: 'German',
|
||||
notes: 'Cathedral with Uhr time format',
|
||||
},
|
||||
|
||||
// SPANISH
|
||||
{
|
||||
name: 'Sagrada Família, Barcelona',
|
||||
url: 'https://sagradafamilia.org/',
|
||||
country: 'ES',
|
||||
language: 'Spanish',
|
||||
notes: 'Major tourist site, may have complex schedule',
|
||||
},
|
||||
{
|
||||
name: 'Parroquia San Miguel, Madrid',
|
||||
url: 'https://www.parroquiasanmiguel.es/',
|
||||
country: 'ES',
|
||||
language: 'Spanish',
|
||||
notes: 'Spanish format with "de lunes a viernes"',
|
||||
},
|
||||
|
||||
// PORTUGUESE
|
||||
{
|
||||
name: 'Basílica da Estrela, Lisbon',
|
||||
url: 'https://www.basilicadaestrela.com/',
|
||||
country: 'PT',
|
||||
language: 'Portuguese',
|
||||
notes: 'Portuguese format',
|
||||
},
|
||||
|
||||
// ITALIAN
|
||||
{
|
||||
name: 'Santa Maria Maggiore, Rome',
|
||||
url: 'https://www.vatican.va/various/basiliche/sm_maggiore/index_it.htm',
|
||||
country: 'IT',
|
||||
language: 'Italian',
|
||||
notes: 'Major basilica',
|
||||
},
|
||||
{
|
||||
name: 'Duomo di Milano',
|
||||
url: 'https://www.duomomilano.it/',
|
||||
country: 'IT',
|
||||
language: 'Italian',
|
||||
notes: 'Cathedral with Italian format',
|
||||
},
|
||||
|
||||
// DUTCH
|
||||
{
|
||||
name: 'Basiliek van de H. Nicolaas, Amsterdam',
|
||||
url: 'https://www.nicolaas-parochie.nl/',
|
||||
country: 'NL',
|
||||
language: 'Dutch',
|
||||
notes: 'Dutch format with "tot" for ranges',
|
||||
},
|
||||
|
||||
// CZECH
|
||||
{
|
||||
name: 'Chrám sv. Víta, Prague',
|
||||
url: 'https://www.katedralasvatehovita.cz/',
|
||||
country: 'CZ',
|
||||
language: 'Czech',
|
||||
notes: 'Czech format',
|
||||
},
|
||||
|
||||
// HUNGARIAN
|
||||
{
|
||||
name: 'Szent István Bazilika, Budapest',
|
||||
url: 'https://www.bazilika.biz/',
|
||||
country: 'HU',
|
||||
language: 'Hungarian',
|
||||
notes: 'Hungarian format',
|
||||
},
|
||||
|
||||
// More complex cases
|
||||
{
|
||||
name: 'Cathédrale Notre-Dame, Strasbourg',
|
||||
url: 'https://www.cathedrale-strasbourg.fr/',
|
||||
country: 'FR',
|
||||
language: 'French',
|
||||
notes: 'Bilingual region (French/German)',
|
||||
},
|
||||
];
|
||||
|
||||
/** How a single church test ended; used to tally the summary. */
type TestOutcome = 'success' | 'noSchedules' | 'failed';

/**
 * Scrapes one church with the shared scraper (after switching it to the
 * church's country) and prints either the schedules grouped by weekday or the
 * reason nothing was printed.
 *
 * @param church  Church to test.
 * @param scraper Initialised scraper, reused across churches.
 * @returns 'success' when at least one schedule was parsed, 'noSchedules'
 *          when the scrape succeeded but produced none, 'failed' when the
 *          scraper reported failure or threw. Never rejects.
 */
async function testChurch(church: TestChurch, scraper: GenericScraper): Promise<TestOutcome> {
  console.log(`\n${'='.repeat(80)}`);
  console.log(`📍 ${church.name}`);
  console.log(` ${church.url}`);
  console.log(` Language: ${church.language} | Country: ${church.country}`);
  if (church.notes) console.log(` Notes: ${church.notes}`);
  console.log(`${'='.repeat(80)}`);

  try {
    scraper.setCountry(church.country);
    const result = await scraper.scrape(church.url);

    if (!result.success) {
      console.log(`❌ FAILED: ${result.error || 'Unknown error'}`);
      return 'failed';
    }

    if (result.schedules.length === 0) {
      console.log(`⚠️ SUCCESS but NO SCHEDULES found`);
      return 'noSchedules';
    }

    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
    console.log(`\n✅ Found ${result.schedules.length} schedules:\n`);

    for (let i = 0; i < 7; i++) {
      if (byDay[i]) {
        const times = byDay[i].map(s => {
          let str = s.time;
          if (s.massType) str += ` (${s.massType})`;
          // English is treated as the default and not annotated.
          if (s.language && s.language !== 'English') str += ` [${s.language}]`;
          return str;
        }).join(', ');
        console.log(` ${dayNames[i]}: ${times}`);
      }
    }

    return 'success';
  } catch (error) {
    console.log(`❌ ERROR: ${error instanceof Error ? error.message : String(error)}`);
    return 'failed';
  }
}

/**
 * Runs `testChurch` over every entry of `testChurches`, pausing 2 seconds
 * between requests, then prints a summary with the number of churches per
 * outcome. The scraper is closed even if the run is interrupted by an error.
 */
async function main() {
  const scraper = new GenericScraper();
  await scraper.init();

  // Tally of outcomes; previously these counters were declared but never
  // updated, so the summary always showed zeros.
  const results: Record<TestOutcome, number> = {
    success: 0,
    failed: 0,
    noSchedules: 0,
  };

  try {
    console.log('🌍 INTERNATIONAL CHURCH SCRAPER TEST');
    console.log(`Testing ${testChurches.length} churches across ${new Set(testChurches.map(c => c.country)).size} countries`);

    for (const church of testChurches) {
      const outcome = await testChurch(church, scraper);
      results[outcome]++;

      // Brief delay between requests to be respectful
      await new Promise(resolve => setTimeout(resolve, 2000));
    }
  } finally {
    await scraper.close();
  }

  console.log(`\n${'='.repeat(80)}`);
  console.log('📊 SUMMARY');
  console.log(`${'='.repeat(80)}`);
  console.log(`Total tested: ${testChurches.length}`);
  console.log(`✅ Success with schedules: ${results.success}`);
  console.log(`⚠️ Success but no schedules: ${results.noSchedules}`);
  console.log(`❌ Failed: ${results.failed}`);
}
|
||||
|
||||
main().catch(console.error);
|
||||
36
scripts/debug/test-masstimes-api.ts
Normal file
36
scripts/debug/test-masstimes-api.ts
Normal file
@@ -0,0 +1,36 @@
|
||||
/**
|
||||
* Quick test script to verify the masstimes.org JSON API scraper works
|
||||
* Usage: npx tsx scripts/debug/test-masstimes-api.ts
|
||||
*/
|
||||
|
||||
import { writeFileSync } from 'fs';
|
||||
import { MassTimesScraper } from '../../src/lib/masstimes-scraper';
|
||||
|
||||
/**
 * Smoke test for the masstimes.org scraper: fetches the churches around a
 * fixed coordinate (Greenville, SC) and saves them to `scraped-churches.json`.
 *
 * On failure the error is logged and the process exit code is set to 1. The
 * scraper is closed in both the success and the failure case.
 */
async function main() {
  console.log('Testing MassTimes.org JSON API Scraper\n');

  const scraper = new MassTimesScraper();

  try {
    await scraper.init();
    console.log('Browser initialized\n');

    const lat = 34.852;
    const lng = -82.394;
    console.log(`Fetching churches near Greenville, SC (${lat}, ${lng})...\n`);

    const churches = await scraper.scrapeByLocation(lat, lng);

    const outPath = 'scraped-churches.json';
    writeFileSync(outPath, JSON.stringify(churches, null, 2));
    console.log(`\nSaved ${churches.length} churches to ${outPath}`);
  } catch (error) {
    console.error('TEST FAILED:', error);
    // process.exit(1) here would end the process immediately and skip the
    // finally block, so the scraper would never be closed. Setting exitCode
    // keeps the failing status while letting cleanup run.
    process.exitCode = 1;
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
main();
|
||||
70
scripts/debug/test-polish-sections.ts
Normal file
70
scripts/debug/test-polish-sections.ts
Normal file
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test which sections are being created for Polish church
|
||||
*/
|
||||
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../../src/scrapers/i18n/day-names';
|
||||
|
||||
// Exact text from the page
const pageText = `msze święte niedziela i uroczystości: 8 00 , 9 30 (lubojenka), 11 00 , 16 00 w lipcu i sierpniu nie ma mszy popołudniowej!--> dni powszednie: poniedziałek: godz. 8 00 wtorek - sobota: godz. 18 00`.toLowerCase();

console.log('Text:');
console.log(pageText);
console.log('\n');

/** Escapes every regex metacharacter so the input matches literally. */
const escapeForRegex = (raw: string): string => raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

// Alternation of all Polish day names, longest first so that a longer name
// is tried before any shorter name.
const polishDayPatterns = buildDayPatterns(getDayNamesForCountry('PL'));
const dayNamesLongestFirst = Object.keys(polishDayPatterns).sort((a, b) => b.length - a.length);
const allDayNamesPattern = dayNamesLongestFirst.map((name) => escapeForRegex(name)).join('|');

/**
 * Builds the section regex for one day name: the day name preceded by start
 * of text, whitespace or `,;:`, followed by `:`/whitespace, lazily capturing
 * everything up to the next day name or the end of the text.
 */
const buildSectionRegex = (dayName: string): RegExp =>
  new RegExp(
    `(?:^|\\s|[,;:])${escapeForRegex(dayName)}[:\\s]+([^]*?)(?=${allDayNamesPattern}|$)`,
    'i'
  );

// Times written as "H MM" / "HH MM" (space-separated hour and minutes).
const spaceSeparatedTime = /\b(\d{1,2})\s+(\d{2})(?!\d)/g;

console.log('=== Testing individual day matching ===\n');

// Test niedziela specifically
const niedzielaMatch = pageText.match(buildSectionRegex('niedziela'));
if (!niedzielaMatch) {
  console.log(`✗ niedziela NOT matched`);
  console.log('');

  // Try simpler regex
  const fallbackMatch = pageText.match(/niedziela[:\s]+(.{0,100})/i);
  if (fallbackMatch) {
    console.log(`Simple regex matched: "${fallbackMatch[1]}"`);
  }
} else {
  const [fullMatch, captured] = niedzielaMatch;
  console.log(`✓ niedziela matched!`);
  console.log(` Full match: "${fullMatch.substring(0, 100)}"`);
  console.log(` Captured text: "${captured.substring(0, 100)}"`);
  console.log('');

  // Test if times can be extracted from captured text
  const foundTimes = captured.match(spaceSeparatedTime);
  console.log(` Times in captured text: ${foundTimes ? foundTimes.join(', ') : 'none'}`);
}

// Test poniedziałek
console.log('\n=== Testing poniedziałek ===\n');

const poniedzialekMatch = pageText.match(buildSectionRegex('poniedziałek'));
if (!poniedzialekMatch) {
  console.log(`✗ poniedziałek NOT matched`);
} else {
  const captured = poniedzialekMatch[1];
  console.log(`✓ poniedziałek matched!`);
  console.log(` Captured text: "${captured.substring(0, 100)}"`);

  const foundTimes = captured.match(spaceSeparatedTime);
  console.log(` Times: ${foundTimes ? foundTimes.join(', ') : 'none'}`);
}
|
||||
65
scripts/debug/test-polish-with-logging.ts
Normal file
65
scripts/debug/test-polish-with-logging.ts
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test Polish church with detailed section logging
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
// Temporarily modify GenericScraper to add logging.
// The original parseSchedules is kept and delegated to at the end, so the
// patch only adds console output on top of the normal parsing.
const originalParse = GenericScraper.prototype['parseSchedules'];

GenericScraper.prototype['parseSchedules'] = function(html: string) {
  // Reduce the HTML to lower-cased plain text: drop <script>/<style> bodies,
  // replace remaining tags with spaces, collapse whitespace.
  const withoutScripts = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
  const withoutStyles = withoutScripts.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, ' ');
  const plainText = withoutTags.replace(/\s+/g, ' ').toLowerCase();

  // Call findScheduleSections and log result
  const sections = this['findScheduleSections'](plainText);

  console.log('\n=== Sections found by findScheduleSections() ===\n');
  const weekdayLabels = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
  for (let index = 0; index < sections.length; index++) {
    const section: any = sections[index];
    console.log(`Section ${index + 1}: ${weekdayLabels[section.day]} (day ${section.day})`);
    console.log(` Text: "${section.text.substring(0, 80)}..."`);
  }
  console.log(`\nTotal sections: ${sections.length}\n`);

  // Continue with normal processing
  return originalParse.call(this, html);
};
|
||||
|
||||
/**
 * Scrapes one Polish parish website with the country set to 'PL' and prints
 * the parsed mass times grouped by weekday (index 0 = Niedziela).
 */
async function testPolish() {
  const url = 'http://parafialubojna.pl';
  console.log(`Testing: ${url}`);

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('PL');

    const result = await scraper.scrape(url);

    console.log(`\nFinal result: ${result.success}`);
    console.log(`Schedules: ${result.schedules.length}\n`);

    if (result.schedules.length > 0) {
      // Bucket the schedules by their numeric day-of-week.
      const byDay: Record<number, typeof result.schedules> = {};
      for (const sched of result.schedules) {
        if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
        byDay[sched.dayOfWeek].push(sched);
      }

      const dayNamesPL = ['Niedziela', 'Poniedziałek', 'Wtorek', 'Środa', 'Czwartek', 'Piątek', 'Sobota'];
      console.log('Parsed schedules by day:');
      for (let i = 0; i < 7; i++) {
        if (byDay[i]) {
          console.log(` ${dayNamesPL[i]}: ${byDay[i].map(s => s.time).join(', ')}`);
        }
      }
    }
  } finally {
    // Close the scraper even when scrape() throws, so it is never left open.
    await scraper.close();
  }
}
|
||||
|
||||
testPolish().catch(console.error);
|
||||
49
scripts/debug/test-time-extraction.ts
Normal file
49
scripts/debug/test-time-extraction.ts
Normal file
@@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test which pattern is matching "00" time
|
||||
*/
|
||||
|
||||
// Candidate time patterns, each tried independently against the sample text.
const timePatterns = [
  { name: '12-hour AM/PM', pattern: /(\d{1,2}):(\d{2})\s*(AM|PM|am|pm|a\.m\.|p\.m\.)/g },
  { name: '12-hour no minutes', pattern: /(?<![:\d])(\d{1,2})\s*(AM|PM|am|pm|a\.m\.|p\.m\.)/g },
  { name: '24-hour colon', pattern: /(?<![:\d\w])(\d{1,2}):(\d{2})(?!\s*(AM|PM|am|pm))/g },
  { name: 'French/Portuguese h', pattern: /(?<![:\d\w])(\d{1,2})\s*h\s*(\d{2})?(?!\w)/gi },
  { name: 'Italian period', pattern: /(?<![:\d\w])(\d{1,2})\.(\d{2})(?=\s|$|,|;|\)|\])/g },
  { name: 'German Uhr (old)', pattern: /(\d{1,2})[:\.]?(\d{2})?\s*Uhr/gi },
  { name: 'German Uhr (fixed)', pattern: /(?<![:\d])(\d{1,2})[:\.]?(\d{2})?\s*Uhr/gi },
  { name: 'Polish space', pattern: /\b(\d{1,2})\s+(\d{2})(?!\d)/g },
];

/**
 * Prints the sample text, then for every pattern either each match with its
 * index or a "no match" line.
 */
function reportPatternMatches(sample: string): void {
  console.log(`Test text: "${sample}"\n`);

  for (const { name, pattern } of timePatterns) {
    const hits = [...sample.matchAll(pattern)];
    if (hits.length === 0) {
      console.log(`✗ ${name}: no match`);
      continue;
    }

    console.log(`✓ ${name}:`);
    for (const hit of hits) {
      console.log(` Matched: "${hit[0]}" at index ${hit.index}`);
    }
  }
}

// Test text from German church
reportPatternMatches("10:00 uhr lateinisches amt");

// Now test with just "00 uhr"
console.log(`\n${'='.repeat(60)}\n`);
reportPatternMatches("00 uhr lateinisches");
|
||||
193
scripts/debug/test-top5-countries.ts
Normal file
193
scripts/debug/test-top5-countries.ts
Normal file
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Quick test of top 5 priority countries
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
const COUNTRIES = [
|
||||
{ code: 'FR', name: 'France' },
|
||||
{ code: 'DE', name: 'Germany' },
|
||||
{ code: 'ES', name: 'Spain' },
|
||||
{ code: 'PL', name: 'Poland' },
|
||||
{ code: 'BR', name: 'Brazil' },
|
||||
];
|
||||
|
||||
const PER_COUNTRY = 10;
|
||||
|
||||
/**
 * Per-country outcome collected by testTop5().
 *
 * - country / countryName: code and display name taken from the COUNTRIES entry.
 * - tested: number of churches fetched for the country.
 * - success: scrapes that reported success AND returned at least one schedule.
 * - failed: every other scrape, including ones that threw.
 * - successRate: success / tested, as a percentage (0-100).
 * - hasBothButFailed: failed scrapes whose raw HTML still contained both a day
 *   name and a time-like token.
 * - totalSchedules: sum of schedules over the successful scrapes.
 * - sampleSuccess: "<church name>: <n> schedules" for the first success, if any.
 */
interface CountryResult {
|
||||
country: string;
|
||||
countryName: string;
|
||||
tested: number;
|
||||
success: number;
|
||||
failed: number;
|
||||
successRate: number;
|
||||
hasBothButFailed: number; // Has days + times but parsing failed
|
||||
totalSchedules: number;
|
||||
sampleSuccess?: string;
|
||||
}
|
||||
|
||||
// Matches a weekday name in English, French, German, Spanish, Polish or
// Portuguese. Hoisted so the regex is compiled once, not per failed church.
const DAY_NAME_PATTERN = /\b(sunday|monday|tuesday|wednesday|thursday|friday|saturday|dimanche|lundi|mardi|mercredi|jeudi|vendredi|samedi|sonntag|montag|dienstag|mittwoch|donnerstag|freitag|samstag|domingo|lunes|martes|miércoles|miercoles|jueves|viernes|sábado|sabado|niedziela|poniedziałek|poniedzialek|wtorek|środa|sroda|czwartek|piątek|piatek|sobota|segunda|terça|terca|quarta|quinta|sexta)\b/i;

// Loose "looks like a time" check: 1-2 digits followed by `h`, `:` or `.`.
const TIME_LIKE_PATTERN = /\d{1,2}[h:\.]\s*\d{0,2}/;

/** Returns true when the page text contains both a day name and a time-like token. */
function htmlHasDaysAndTimes(rawHtml: string): boolean {
  const text = rawHtml
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .toLowerCase();

  return DAY_NAME_PATTERN.test(text) && TIME_LIKE_PATTERN.test(text);
}

/** Scrapes up to PER_COUNTRY churches of one country; returns undefined when none have a website. */
async function testCountry(
  country: (typeof COUNTRIES)[number],
  scraper: GenericScraper,
): Promise<CountryResult | undefined> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`Testing ${country.name} (${country.code})`);
  console.log('='.repeat(60));

  const churches = await prisma.church.findMany({
    where: {
      country: country.code,
      website: { not: null },
      source: 'osm',
    },
    take: PER_COUNTRY,
    orderBy: { createdAt: 'asc' },
  });

  if (churches.length === 0) {
    console.log(`No churches with websites found for ${country.name}\n`);
    return undefined;
  }

  scraper.setCountry(country.code);

  let success = 0;
  let failed = 0;
  let hasBothButFailed = 0;
  let totalSchedules = 0;
  let sampleSuccess: string | undefined;

  for (let i = 0; i < churches.length; i++) {
    const church = churches[i];
    process.stdout.write(`[${i + 1}/${churches.length}] ${church.name.substring(0, 40).padEnd(40)} `);

    try {
      // The query above filters on `website: { not: null }`, hence the assertion.
      const result = await scraper.scrape(church.website!);

      if (result.success && result.schedules.length > 0) {
        success++;
        totalSchedules += result.schedules.length;
        process.stdout.write(`✅ ${result.schedules.length} schedules\n`);

        if (!sampleSuccess) {
          sampleSuccess = `${church.name}: ${result.schedules.length} schedules`;
        }
      } else {
        failed++;
        // A "successful" scrape with zero schedules carries no error text;
        // print a real reason instead of the literal "undefined".
        process.stdout.write(`❌ ${result.error ?? 'No schedules found'}\n`);

        // Check if has both days and times (parsing bug indicator)
        if (result.rawHtml && htmlHasDaysAndTimes(result.rawHtml)) {
          hasBothButFailed++;
          process.stdout.write(` ⚠️ Has days + times but failed to parse\n`);
        }
      }
    } catch (err: unknown) {
      failed++;
      const message = err instanceof Error ? err.message : String(err);
      process.stdout.write(`❌ ERROR: ${message}\n`);
    }
  }

  const successRate = (success / churches.length) * 100;

  console.log(`\n${country.name} Summary: ${success}/${churches.length} (${successRate.toFixed(0)}%)`);
  console.log(` Total schedules extracted: ${totalSchedules}`);
  if (hasBothButFailed > 0) {
    console.log(` ⚠️ Parsing bugs: ${hasBothButFailed} (has content but failed to parse)`);
  }

  return {
    country: country.code,
    countryName: country.name,
    tested: churches.length,
    success,
    failed,
    successRate,
    hasBothButFailed,
    totalSchedules,
    sampleSuccess,
  };
}

/** Prints the per-country table followed by the overall totals. */
function printFinalSummary(results: CountryResult[]): void {
  console.log('\n\n');
  console.log('═'.repeat(80));
  console.log('FINAL RESULTS - TOP 5 COUNTRIES');
  console.log('═'.repeat(80));
  console.log('');
  console.log('Country | Tested | Success | Rate | Schedules | Bugs');
  console.log('─'.repeat(80));

  const totalTested = results.reduce((sum, r) => sum + r.tested, 0);
  const totalSuccess = results.reduce((sum, r) => sum + r.success, 0);
  const totalSchedules = results.reduce((sum, r) => sum + r.totalSchedules, 0);
  const totalBugs = results.reduce((sum, r) => sum + r.hasBothButFailed, 0);

  for (const r of results) {
    const country = r.countryName.padEnd(12);
    const tested = String(r.tested).padStart(6);
    const success = String(r.success).padStart(7);
    const rate = `${r.successRate.toFixed(0)}%`.padStart(5);
    const schedules = String(r.totalSchedules).padStart(9);
    const bugs = r.hasBothButFailed > 0 ? `⚠️ ${r.hasBothButFailed}` : '✓';

    console.log(`${country} | ${tested} | ${success} | ${rate} | ${schedules} | ${bugs}`);
  }

  console.log('─'.repeat(80));
  const avgRate = totalTested > 0 ? (totalSuccess / totalTested) * 100 : 0;
  console.log(`OVERALL | ${String(totalTested).padStart(6)} | ${String(totalSuccess).padStart(7)} | ${avgRate.toFixed(0).padStart(4)}% | ${String(totalSchedules).padStart(9)} | ${totalBugs > 0 ? `⚠️ ${totalBugs}` : '✓'}`);
  console.log('');
  console.log('═'.repeat(80));
  console.log('');

  if (totalBugs > 0) {
    console.log(`⚠️ ${totalBugs} parsing bugs detected (has days + times but failed)`);
    console.log(' These need investigation and fixes.\n');
  } else {
    console.log('✅ No parsing bugs! All failures are legitimate (no content or wrong page).\n');
  }

  console.log(`Total churches tested: ${totalTested}`);
  console.log(`Total successful: ${totalSuccess} (${avgRate.toFixed(1)}%)`);
  console.log(`Total mass schedules extracted: ${totalSchedules}`);
  console.log('');
}

/**
 * Scrapes PER_COUNTRY churches for each entry in COUNTRIES and prints a
 * per-country and overall success report.
 *
 * The scraper, the Prisma client and the pg pool are released in `finally`
 * blocks so a failing query or scrape cannot leave them open.
 */
async function testTop5() {
  console.log('Testing top 5 priority countries (10 churches each)...\n');

  try {
    const scraper = new GenericScraper();
    await scraper.init();

    const results: CountryResult[] = [];
    try {
      for (const country of COUNTRIES) {
        const countryResult = await testCountry(country, scraper);
        if (countryResult) results.push(countryResult);
      }
    } finally {
      await scraper.close();
    }

    printFinalSummary(results);
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
testTop5().catch(console.error);
|
||||
173
scripts/debug/test-website-scraper.ts
Normal file
173
scripts/debug/test-website-scraper.ts
Normal file
@@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Test website scraper on churches with websites
|
||||
* Analyzes which websites can be scraped successfully
|
||||
*/
|
||||
|
||||
// Load .env
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
import fs from 'fs';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Outcome of scraping one church website, as recorded by testScrapers() and
 * written to scraper-test-results.json.
 *
 * - churchId / name / country: copied from the church row.
 * - website: the church's website after normalizeUrl().
 * - success: the scraper's own success flag (false when the scrape threw).
 * - massesFound: number of schedules returned (0 when the scrape threw).
 * - schedules: the returned schedules reduced to day, time, type and language;
 *   absent when the scrape threw.
 * - error: the scraper's error text or the thrown error's message.
 */
interface TestResult {
|
||||
churchId: string;
|
||||
name: string;
|
||||
website: string;
|
||||
country: string;
|
||||
success: boolean;
|
||||
massesFound: number;
|
||||
schedules?: { dayOfWeek: number; time: string; massType?: string; language?: string }[];
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
 * Ensures a website value carries an explicit scheme.
 *
 * - Surrounding whitespace is removed.
 * - An existing `http://` / `https://` prefix is detected case-insensitively
 *   and the URL is returned unchanged (apart from trimming).
 * - Protocol-relative values (`//host/path`) get `https:` prepended.
 * - Anything else gets `https://` prepended.
 *
 * @param url Raw website value, possibly without a scheme.
 * @returns The URL with a scheme.
 */
function normalizeUrl(url: string): string {
  const trimmed = url.trim();

  if (/^https?:\/\//i.test(trimmed)) {
    return trimmed;
  }
  if (trimmed.startsWith('//')) {
    return `https:${trimmed}`;
  }
  return `https://${trimmed}`;
}
|
||||
|
||||
/**
 * Scrapes the most recently created churches that have a website and reports
 * which sites yielded mass schedules.
 *
 * Prints per-church progress and a summary, and writes the collected results
 * to `scraper-test-results.json` in the current working directory.
 *
 * @param limit Maximum number of churches to test (default 50).
 * @param country Optional country filter; all countries when omitted.
 * @returns One TestResult per tested church.
 */
async function testScrapers(limit: number = 50, country?: string) {
  const results: TestResult[] = [];
  const dayLabels = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];

  // Get churches with websites
  const churches = await prisma.church.findMany({
    where: {
      website: { not: null },
      ...(country ? { country } : {}),
    },
    take: limit,
    orderBy: { createdAt: 'desc' },
  });

  console.log(`Testing ${churches.length} churches with websites...\n`);

  // Initialize the scraper (launches Playwright browser)
  const scraper = new GenericScraper();
  await scraper.init();

  try {
    for (let i = 0; i < churches.length; i++) {
      const church = churches[i];
      // The query filters on `website: { not: null }`, hence the assertion.
      const url = normalizeUrl(church.website!);
      console.log(`[${i + 1}/${churches.length}] Testing: ${church.name}`);
      console.log(` Website: ${url}`);

      try {
        const result = await scraper.scrape(url);

        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: result.success,
          massesFound: result.schedules.length,
          schedules: result.schedules.map((s) => ({
            dayOfWeek: s.dayOfWeek,
            time: s.time,
            massType: s.massType,
            language: s.language,
          })),
          error: result.error,
        });

        if (result.success) {
          console.log(` ✓ ${result.schedules.length} masses found`);
          for (const s of result.schedules) {
            console.log(` ${dayLabels[s.dayOfWeek]} ${s.time} (${s.language || 'English'}${s.massType ? ', ' + s.massType : ''})`);
          }
        } else {
          console.log(` ✗ No masses found: ${result.error}`);
        }
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        console.log(` ✗ Error: ${message}`);
        results.push({
          churchId: church.id,
          name: church.name,
          website: url,
          country: church.country,
          success: false,
          massesFound: 0,
          error: message,
        });
      }

      console.log('');
    }
  } finally {
    // Always close the browser
    await scraper.close();
  }

  // Summary
  const successful = results.filter((r) => r.success);
  const failed = results.filter((r) => !r.success);
  const totalMasses = results.reduce((sum, r) => sum + r.massesFound, 0);

  // Guard against 0/0 so an empty run prints "0.0%" instead of "NaN%".
  const percentOfTested = (count: number): string =>
    results.length > 0 ? ((count / results.length) * 100).toFixed(1) : '0.0';

  console.log('============================================================');
  console.log('Test Summary');
  console.log('============================================================');
  console.log(`Total churches tested: ${results.length}`);
  console.log(`Successful scrapes: ${successful.length} (${percentOfTested(successful.length)}%)`);
  console.log(`Failed scrapes: ${failed.length} (${percentOfTested(failed.length)}%)`);
  console.log(`Total masses found: ${totalMasses}`);
  console.log('============================================================');

  if (failed.length > 0) {
    console.log('\nFailed websites:');
    for (const f of failed) {
      console.log(` - ${f.name}: ${f.website} (${f.error})`);
    }
  }

  console.log('');

  // Export results (without raw HTML to keep file manageable)
  fs.writeFileSync(
    'scraper-test-results.json',
    JSON.stringify(results, null, 2)
  );
  console.log('Results saved to scraper-test-results.json');

  return results;
}
|
||||
|
||||
/**
 * CLI entry point: parses `--limit <n>` and `--country <code>`, runs
 * testScrapers(), and always releases the database handles afterwards.
 *
 * A missing or non-numeric `--limit` value falls back to 50 with a warning
 * instead of passing NaN to the query.
 */
async function main() {
  const DEFAULT_LIMIT = 50;
  const args = process.argv.slice(2);
  const limitIndex = args.indexOf('--limit');
  const countryIndex = args.indexOf('--country');

  let limit = DEFAULT_LIMIT;
  if (limitIndex !== -1) {
    const parsed = Number.parseInt(args[limitIndex + 1] ?? '', 10);
    if (Number.isInteger(parsed) && parsed > 0) {
      limit = parsed;
    } else {
      console.warn(`Ignoring invalid --limit value "${args[limitIndex + 1]}"; using ${DEFAULT_LIMIT}`);
    }
  }
  const country = countryIndex !== -1 ? args[countryIndex + 1] : undefined;

  console.log('============================================================');
  console.log('Website Scraper Testing');
  console.log('============================================================');
  console.log(`Limit: ${limit}`);
  console.log(`Country: ${country || 'All'}`);
  console.log('============================================================\n');

  try {
    await testScrapers(limit, country);
  } finally {
    // Release DB handles even when the run fails.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
main().catch(console.error);
|
||||
53
scripts/debug/verify-paz-schedules.ts
Normal file
53
scripts/debug/verify-paz-schedules.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Verify Paróquia da Paz schedules are correctly parsed
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../../src/scrapers/strategies/generic';
|
||||
|
||||
/**
 * Scrapes the Paróquia da Paz website with the country set to 'BR', prints
 * the parsed schedule grouped by weekday, and then prints the hard-coded
 * expected schedule for manual comparison.
 */
async function verifyPazSchedules() {
  const url = 'https://www.paroquiadapaz.org.br/';
  console.log(`Verifying: ${url}\n`);

  const scraper = new GenericScraper();
  await scraper.init();

  try {
    scraper.setCountry('BR');

    const result = await scraper.scrape(url);

    console.log(`✅ Success: ${result.success}`);
    console.log(`📅 Schedules found: ${result.schedules.length}\n`);

    // Group by day
    const byDay: Record<number, typeof result.schedules> = {};
    for (const sched of result.schedules) {
      if (!byDay[sched.dayOfWeek]) byDay[sched.dayOfWeek] = [];
      byDay[sched.dayOfWeek].push(sched);
    }

    const dayNames = ['Domingo', 'Segunda', 'Terça', 'Quarta', 'Quinta', 'Sexta', 'Sábado'];

    console.log('═══════════════════════════════════════════════');
    console.log('PARSED SCHEDULE:');
    console.log('═══════════════════════════════════════════════\n');

    Object.entries(byDay)
      .sort(([a], [b]) => parseInt(a) - parseInt(b))
      .forEach(([day, scheds]) => {
        console.log(`${dayNames[parseInt(day)]}:`);
        scheds.forEach(s => {
          console.log(` ${s.time} - ${s.language} ${s.massType}`);
        });
        console.log('');
      });

    console.log('Expected schedule (from website):');
    console.log('Segunda, Terça, Quarta, Sexta: 16:00 e 18:00');
    console.log('Quinta: 16:00 e 19:00');
    console.log('Sábado: 08:00, 16:00 e 18:00');
    console.log('Domingo: 08:00, 11:00, 16:00, 18:00 e 20:00');
  } finally {
    // Close the scraper even if scrape() throws.
    await scraper.close();
  }
}
|
||||
|
||||
verifyPazSchedules().catch(console.error);
|
||||
97
scripts/dedup-churches.ts
Normal file
97
scripts/dedup-churches.ts
Normal file
@@ -0,0 +1,97 @@
|
||||
/**
|
||||
* Find duplicate churches using ChromaDB semantic similarity.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/dedup-churches.ts # Dry run, show duplicates
|
||||
* npx tsx scripts/dedup-churches.ts --threshold 0.15 # Custom similarity threshold
|
||||
* npx tsx scripts/dedup-churches.ts --country US # Only check US churches
|
||||
* npx tsx scripts/dedup-churches.ts --limit 100 # Check first 100 churches
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { findSimilarChurches } from '../src/chromadb/queries';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
const args = process.argv.slice(2);

const DEFAULT_THRESHOLD = 0.15; // Cosine distance threshold (lower = more similar)
const DEFAULT_LIMIT = 500;

/**
 * Parses `--threshold <float>`, `--country <code>` and `--limit <int>`.
 *
 * A flag that is present but has a missing or unparsable value falls back to
 * its default with a warning. Previously such input produced NaN, which made
 * `distance <= threshold` false for every candidate and was passed as `take`.
 */
function parseCliOptions(argv: readonly string[]): {
  threshold: number;
  country: string | undefined;
  limit: number;
} {
  const valueOf = (flag: string): string | undefined => {
    const index = argv.indexOf(flag);
    return index === -1 ? undefined : argv[index + 1];
  };

  let threshold = DEFAULT_THRESHOLD;
  if (argv.includes('--threshold')) {
    const parsed = Number.parseFloat(valueOf('--threshold') ?? '');
    if (Number.isFinite(parsed)) {
      threshold = parsed;
    } else {
      console.warn(`Ignoring invalid --threshold value; using ${DEFAULT_THRESHOLD}`);
    }
  }

  let limit = DEFAULT_LIMIT;
  if (argv.includes('--limit')) {
    const parsed = Number.parseInt(valueOf('--limit') ?? '', 10);
    if (Number.isInteger(parsed)) {
      limit = parsed;
    } else {
      console.warn(`Ignoring invalid --limit value; using ${DEFAULT_LIMIT}`);
    }
  }

  return { threshold, country: valueOf('--country'), limit };
}

const { threshold, country, limit } = parseCliOptions(args);
|
||||
|
||||
/**
 * Dry-run duplicate finder.
 *
 * Loads up to `limit` churches (optionally filtered by `country`), queries
 * findSimilarChurches() for each one, and prints every church that has at
 * least one other church within `threshold` distance. Churches already
 * printed as a match are skipped as originals. Nothing is modified.
 */
async function main() {
  try {
    console.log(`Finding duplicate churches (threshold=${threshold}, country=${country || 'all'}, limit=${limit})`);
    console.log('---');

    const churches = await prisma.church.findMany({
      take: limit,
      where: country ? { country } : undefined,
      orderBy: { name: 'asc' },
      select: {
        id: true,
        name: true,
        address: true,
        city: true,
        country: true,
        source: true,
        latitude: true,
        longitude: true,
        _count: { select: { massSchedules: true } },
      },
    });

    console.log(`Checking ${churches.length} churches...\n`);

    // IDs already reported as a match of an earlier church.
    const seen = new Set<string>();
    let duplicateCount = 0;

    for (const church of churches) {
      if (seen.has(church.id)) continue;

      const text = `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim();
      const similar = await findSimilarChurches(text, {
        country: church.country,
        nResults: 5,
      });

      // Filter to matches within threshold, excluding self
      const matches = similar.filter(
        (s) => s.churchId !== church.id && s.distance <= threshold
      );
      if (matches.length === 0) continue;

      duplicateCount++;
      console.log(`\nPotential duplicate #${duplicateCount}:`);
      console.log(` Original: "${church.name}" (${church.city || 'no city'}, ${church.country})`);
      console.log(` ID: ${church.id}, Source: ${church.source}, Schedules: ${church._count.massSchedules}`);
      console.log(` Lat/Lng: ${church.latitude}, ${church.longitude}`);

      for (const match of matches) {
        console.log(` Match: "${match.document}" (distance: ${match.distance.toFixed(4)})`);
        console.log(` ID: ${match.churchId}`);
        seen.add(match.churchId);
      }
    }

    console.log(`\n---`);
    console.log(`Found ${duplicateCount} potential duplicate groups from ${churches.length} churches`);
    console.log(`Threshold: ${threshold} (lower = stricter matching)`);
  } finally {
    // Release DB handles even when a query or similarity lookup throws.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
main().catch((err) => {
|
||||
console.error(err);
|
||||
process.exit(1);
|
||||
});
|
||||
72
scripts/dedup-mass-schedules.ts
Normal file
72
scripts/dedup-mass-schedules.ts
Normal file
@@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env tsx
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
import { Pool } from 'pg';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
|
||||
/**
 * Row shape of the duplicate-count query in main(): the two aliased COUNT
 * columns. Both are typed as strings and converted with Number() where a
 * numeric comparison is needed.
 */
interface CountResult {
|
||||
churches_with_dups: string;
|
||||
duplicate_rows: string;
|
||||
}
|
||||
|
||||
/**
 * Counts — and with `--execute` deletes — duplicate active mass schedules.
 *
 * Rows are duplicates when they share church_id, day_of_week, time and
 * language; the oldest row by created_at is kept. Without `--execute` the
 * script only reports the counts.
 */
async function main() {
  const dryRun = !process.argv.includes('--execute');

  if (dryRun) {
    console.log('DRY RUN - pass --execute to actually delete duplicates\n');
  }

  try {
    const client = await pool.connect();

    try {
      const countResult = await client.query<CountResult>(`
        WITH ranked AS (
          SELECT church_id,
                 ROW_NUMBER() OVER (
                   PARTITION BY church_id, day_of_week, time, language
                   ORDER BY created_at ASC
                 ) AS rn
          FROM mass_schedules
          WHERE is_active = true
        )
        SELECT COUNT(DISTINCT church_id) AS churches_with_dups,
               COUNT(*) AS duplicate_rows
        FROM ranked
        WHERE rn > 1;
      `);

      const { churches_with_dups, duplicate_rows } = countResult.rows[0];
      console.log(`Churches with duplicate schedules: ${churches_with_dups}`);
      // The original ternary had identical branches; make the dry-run wording explicit.
      console.log(`Duplicate rows ${dryRun ? 'that would be deleted' : 'to delete'}: ${duplicate_rows}\n`);

      if (!dryRun && Number(duplicate_rows) > 0) {
        console.log('Deleting duplicates (keeping oldest by created_at)...');

        const deleteResult = await client.query(`
          WITH ranked AS (
            SELECT id,
                   ROW_NUMBER() OVER (
                     PARTITION BY church_id, day_of_week, time, language
                     ORDER BY created_at ASC
                   ) AS rn
            FROM mass_schedules
            WHERE is_active = true
          )
          DELETE FROM mass_schedules
          WHERE id IN (SELECT id FROM ranked WHERE rn > 1);
        `);

        console.log(`Deleted ${deleteResult.rowCount} duplicate mass schedule rows.`);
      }
    } finally {
      client.release();
    }
  } finally {
    // End the pool even when pool.connect() itself rejects.
    await pool.end();
  }
}
|
||||
|
||||
main().catch((err) => {
|
||||
console.error('Fatal error:', err);
|
||||
process.exit(1);
|
||||
});
|
||||
27
scripts/deploy-to-nas.sh
Executable file
27
scripts/deploy-to-nas.sh
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/bin/bash
# Deploy ScraperControl to the NAS: rsync the working tree, then rebuild and
# restart the containers over SSH.
#
# NAS_HOST, NAS_PATH and LOCAL_PATH can be overridden from the environment;
# the defaults match the previously hard-coded values.

# -e: stop on the first failing command; -u: unset variables are errors;
# pipefail: a failure anywhere in a pipeline fails the pipeline.
set -euo pipefail

NAS_HOST="${NAS_HOST:-albert@192.168.0.145}"
NAS_PATH="${NAS_PATH:-/volume1/docker/scraper-control}"
LOCAL_PATH="${LOCAL_PATH:-/Users/albert/Documents/Projects/Church/ScraperControl}"

echo "Deploying ScraperControl to NAS..."

rsync -avz \
  --exclude 'node_modules' \
  --exclude '.next' \
  --exclude '.git' \
  --exclude '.env.local' \
  --exclude '*.log' \
  "$LOCAL_PATH/" \
  "$NAS_HOST:$NAS_PATH/"

echo "Rebuilding containers..."

# Unquoted heredoc delimiter: $NAS_PATH is expanded locally, so the remote
# `cd` always targets the directory rsync just wrote to.
# `set -e` makes the remote script stop if the build fails instead of
# restarting containers from a stale image.
ssh "$NAS_HOST" << ENDSSH
set -e
cd "$NAS_PATH"
/usr/local/bin/docker compose build app scraper scheduler
/usr/local/bin/docker compose up -d scheduler freesearch-enrichment
/usr/local/bin/docker compose ps
/usr/local/bin/docker compose logs --tail 5 scheduler
ENDSSH

echo "Deployment complete!"
|
||||
226
scripts/enrich-with-forward-geocode.ts
Normal file
226
scripts/enrich-with-forward-geocode.ts
Normal file
@@ -0,0 +1,226 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Enrich churches that have lat/lng=0 with real coordinates via Nominatim forward geocoding.
|
||||
* After this runs, enrich-with-reverse-geocode fills city/state from the new coordinates.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/enrich-with-forward-geocode.ts --country HK --dry-run
|
||||
* npx tsx scripts/enrich-with-forward-geocode.ts --country HK
|
||||
* npx tsx scripts/enrich-with-forward-geocode.ts --limit 10
|
||||
*
|
||||
* Rate limit: 1 request/second (Nominatim usage policy — mandatory).
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import axios from 'axios';
|
||||
|
||||
// Database connection: DATABASE_URL when set, otherwise the local development default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
const pool = new Pool({
  connectionString: dbUrl,
  // URLs containing "neon" get TLS without certificate verification; everything else uses the pg default.
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

// Nominatim forward-search endpoint and the pause (in ms) inserted between requests.
const NOMINATIM_SEARCH_URL = 'https://nominatim.openstreetmap.org/search';
const RATE_LIMIT_MS = 1100;

// Some regions use a different ISO code in OSM than in our DB
const NOMINATIM_COUNTRY_MAP: Record<string, string> = {
  HK: 'cn', // Hong Kong is part of China in OSM
  MO: 'cn', // Macau likewise
};
|
||||
|
||||
/** Columns selected from the church table for forward geocoding. */
interface ChurchRecord {
  id: string;
  name: string;
  address: string;
  country: string;
  city: string | null;
  state: string | null;
}

/** Fields read from one entry of the Nominatim search response. */
interface NominatimSearchResult {
  /** Latitude as a decimal string; parsed with parseFloat by the caller. */
  lat: string;
  /** Longitude as a decimal string; parsed with parseFloat by the caller. */
  lon: string;
  display_name: string;
  /** Present because the request sets addressdetails=1. */
  address?: {
    city?: string;
    town?: string;
    village?: string;
    municipality?: string;
    state?: string;
    province?: string;
  };
}
|
||||
|
||||
/** Prints `msg` to stdout, prefixed with the current time as an ISO-8601 string in square brackets. */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
|
||||
|
||||
/** Returns a promise that resolves (with no value) once `ms` milliseconds have elapsed. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Normalises a raw address before it is used as a Nominatim query:
 * removes one trailing Hong Kong region suffix, the first "R.E." marker,
 * a trailing full stop, and surrounding whitespace.
 */
function cleanAddress(address: string): string {
  return address
    // Strip trailing city/region suffixes
    .replace(/,?\s*(H\.K\.|HK|Hong Kong|Kowloon|Kln\.|New Territories|N\.T\.|Lantau Island)\.?\s*$/i, '')
    // Strip "R.E." (Religious Education suffix used in HK addresses)
    .replace(/,?\s*R\.E\./i, '')
    .replace(/\.$/, '')
    .trim();
}

/**
 * Fallback: strip any leading non-numeric institution name prefix and floor/unit designators,
 * returning just the street number onwards. Handles patterns like:
 *   "Canossa School (H.K.) 8 Hoi Chak Street" → "8 Hoi Chak Street"
 *   "G/F., Wai Ming Block, 111 Wing Hong Street" → "111 Wing Hong Street"
 *   "3/F., Chi Wo Commercial Building, 20 Saigon Street" → "20 Saigon Street"
 *
 * @param address Raw address as stored on the church record.
 * @returns The cleaned street portion, or null when no house number is found
 *          or the extracted part is not meaningfully shorter than the input.
 */
function extractStreetAddress(address: string): string | null {
  // Preferred: a house number at the very start or directly after a comma.
  const afterComma = /(?:^|,\s*)(\d+[A-Za-z]?(?:\s|,).*)/;
  // Secondary: a house number preceded only by whitespace. Needed for inputs such as
  // "Canossa School (H.K.) 8 Hoi Chak Street", which contain no comma at all.
  // It is tried only when the comma pattern finds nothing, so inputs that matched
  // before still produce exactly the same result.
  const afterSpace = /\s(\d+[A-Za-z]?(?:\s|,).*)/;

  const match = address.match(afterComma) ?? address.match(afterSpace);
  if (!match) return null;

  const candidate = match[1].trim();
  // Must be meaningfully shorter than the full address to be worth retrying
  return candidate.length < address.length * 0.9 ? cleanAddress(candidate) : null;
}
|
||||
|
||||
/**
 * Runs one Nominatim forward search and returns the top hit.
 *
 * @param query Free-text address to search for.
 * @param nominatimCountry Value sent as the `countrycodes` restriction.
 * @returns The first result, or null when the response holds no results.
 *          HTTP and network failures from axios propagate to the caller.
 */
async function nominatimSearch(query: string, nominatimCountry: string): Promise<NominatimSearchResult | null> {
  const requestConfig = {
    params: {
      q: query,
      format: 'json',
      limit: 1,
      countrycodes: nominatimCountry,
      addressdetails: 1,
    },
    headers: {
      'User-Agent': 'NearestMass/1.0 (privacy@nearestmass.com)',
      'Accept-Language': 'en',
    },
    timeout: 15000,
  };

  const { data } = await axios.get(NOMINATIM_SEARCH_URL, requestConfig);
  const results: NominatimSearchResult[] = data;

  if (results.length > 0) {
    return results[0];
  }
  return null;
}
|
||||
|
||||
/**
 * Geocodes an address with up to two Nominatim requests.
 *
 * The cleaned full address is tried first. If it yields nothing, the
 * street-number-onwards portion is tried after a rate-limit pause, provided
 * it exists and differs from the first query.
 *
 * @param address Raw address from the church record.
 * @param countryCode Country code from our DB; mapped through NOMINATIM_COUNTRY_MAP, else lower-cased.
 * @returns The hit plus whether the fallback query produced it, or null when both attempts fail.
 */
async function forwardGeocode(
  address: string,
  countryCode: string
): Promise<{ result: NominatimSearchResult; usedFallback: boolean } | null> {
  const nominatimCountry = NOMINATIM_COUNTRY_MAP[countryCode] ?? countryCode.toLowerCase();
  const fullQuery = cleanAddress(address);

  const direct = await nominatimSearch(fullQuery, nominatimCountry);
  if (direct) {
    return { result: direct, usedFallback: false };
  }

  // Fallback: try just the street-number-onwards portion
  const streetQuery = extractStreetAddress(address);
  if (!streetQuery || streetQuery === fullQuery) {
    return null;
  }

  await sleep(RATE_LIMIT_MS); // respect rate limit between retries
  const retried = await nominatimSearch(streetQuery, nominatimCountry);
  return retried ? { result: retried, usedFallback: true } : null;
}
|
||||
|
||||
/**
 * CLI entry point: finds churches whose latitude and longitude are both 0 and
 * that have an address, geocodes each through Nominatim, and (unless
 * --dry-run is given) stores the coordinates plus any missing city/state.
 *
 * Flags: --country <code>, --limit <n>, --dry-run.
 *
 * @throws Error when --country has no value or --limit is not a positive integer.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');
  const countryIdx = args.indexOf('--country');
  const limitIdx = args.indexOf('--limit');
  const countryCode = countryIdx !== -1 ? args[countryIdx + 1] : undefined;
  const limit = limitIdx !== -1 ? parseInt(args[limitIdx + 1], 10) : undefined;

  try {
    // Fail fast on malformed flags. A missing --country value used to silently
    // widen the run to every country, and a NaN limit was handed to Prisma as `take`.
    if (countryIdx !== -1 && (!countryCode || countryCode.startsWith('--'))) {
      throw new Error('--country requires a country code (e.g. --country HK)');
    }
    if (limit !== undefined && (!Number.isInteger(limit) || limit <= 0)) {
      throw new Error('--limit requires a positive integer (e.g. --limit 10)');
    }

    log('============================================================');
    log('Nominatim Forward Geocode Enrichment');
    log('============================================================');
    log(`Country: ${countryCode || 'All'}`);
    log(`Limit: ${limit || 'No limit'}`);
    log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
    log('============================================================');

    const churches = await prisma.church.findMany({
      where: {
        latitude: 0,
        longitude: 0,
        address: { not: null },
        ...(countryCode ? { country: countryCode } : {}),
      },
      select: { id: true, name: true, address: true, country: true, city: true, state: true },
      orderBy: { createdAt: 'asc' },
      take: limit,
    }) as ChurchRecord[];

    log(`Found ${churches.length} churches with lat/lng=0 and an address\n`);

    const stats = { found: 0, notFound: 0, errors: 0 };

    let position = 0;
    for (const church of churches) {
      position++;
      try {
        const geocoded = await forwardGeocode(church.address, church.country);

        if (!geocoded) {
          log(` - [NOT FOUND] ${church.name} | ${church.address}`);
          stats.notFound++;
        } else {
          const { result, usedFallback } = geocoded;
          const lat = parseFloat(result.lat);
          const lng = parseFloat(result.lon);
          // Never persist NaN/Infinity if the service returns a malformed coordinate.
          if (!Number.isFinite(lat) || !Number.isFinite(lng)) {
            throw new Error(`invalid coordinates in response: "${result.lat}", "${result.lon}"`);
          }
          const city = result.address?.city || result.address?.town ||
            result.address?.village || result.address?.municipality || null;
          const state = result.address?.state || result.address?.province || null;

          log(` + [FOUND${usedFallback ? ' (fallback)' : ''}] ${church.name}`);
          log(` ${church.address}`);
          log(` → ${lat}, ${lng}${city ? ` (${city})` : ''}`);

          if (!dryRun) {
            const updateData: Record<string, unknown> = { latitude: lat, longitude: lng };
            // City and state are only filled in when the record has none yet.
            if (city && !church.city) updateData.city = city;
            if (state && !church.state) updateData.state = state;

            await prisma.church.update({
              where: { id: church.id },
              data: updateData,
            });
          }

          stats.found++;
        }
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        log(` ! [ERROR] ${church.name}: ${message}`);
        stats.errors++;
      }

      // Pause between churches (also after errors); no need to wait after the last one.
      if (position < churches.length) {
        await sleep(RATE_LIMIT_MS);
      }
    }

    log('');
    log('============================================================');
    log('Forward Geocode Summary');
    log('============================================================');
    log(`Found coords: ${stats.found}`);
    log(`Not found: ${stats.notFound}`);
    log(`Errors: ${stats.errors}`);
    log('============================================================');
  } finally {
    // Release database connections on success and on failure alike.
    await prisma.$disconnect();
    await pool.end();
  }
}

main().catch(err => {
  console.error('Fatal error:', err);
  process.exit(1);
});
|
||||
408
scripts/enrich-with-google-places.ts
Normal file
408
scripts/enrich-with-google-places.ts
Normal file
@@ -0,0 +1,408 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
 * Enrich OSM churches with Google Places data (website, phone, place ID)
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/enrich-with-google-places.ts --limit 10 --dry-run
|
||||
* npx tsx scripts/enrich-with-google-places.ts --country BR --limit 100
|
||||
* npx tsx scripts/enrich-with-google-places.ts --all
|
||||
*
|
||||
* Rate Limiting:
|
||||
* - Free tier: $200/month credit
|
||||
* - Text Search: ~$17 per 1000 requests
|
||||
* - $200 / $17 = ~11,764 requests per month
|
||||
* - ~390 churches per day to stay within free tier
|
||||
* - Script uses 2-second delay between requests (max 1,800/hour)
|
||||
*/
|
||||
|
||||
// Load .env for database connection
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
// Use DATABASE_URL from .env (works for both local dev and NAS/production)
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import axios from 'axios';
|
||||
|
||||
// Dedicated pg pool + Prisma client for this script, configured from DATABASE_URL.
const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

// Google Places (New) Text Search settings.
const GOOGLE_PLACES_API_KEY = process.env.GOOGLE_PLACES_API_KEY;
const PLACES_API_URL = 'https://places.googleapis.com/v1/places:searchText';
const RATE_LIMIT_MS = 2000; // 2 seconds between requests
|
||||
|
||||
// --- Job Tracking ---
|
||||
/**
 * Resumes an existing background job when `--job-id <id>` is on the command line.
 *
 * @param args Command-line arguments (without node/script path).
 * @returns The job id after marking it 'running' with a fresh startedAt,
 *          or null when the flag is absent.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagPosition = args.indexOf('--job-id');
  if (flagPosition === -1) {
    return null;
  }

  const jobId = args[flagPosition + 1];
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||||
|
||||
/**
 * Inserts a new 'google-enrichment' background job that is already running.
 *
 * @param config Run parameters stored on the job row.
 * @returns The id of the created job.
 */
async function createNewJob(config: Record<string, unknown>): Promise<string> {
  const { id } = await prisma.backgroundJob.create({
    data: {
      type: 'google-enrichment',
      status: 'running',
      startedAt: new Date(),
      config: config as any,
    },
  });
  return id;
}
|
||||
|
||||
/**
 * Writes the current counters onto the background job row.
 *
 * @param jobId Job to update.
 * @param processed Number of items handled so far.
 * @param succeeded Number of items counted as successful.
 * @param failed Number of items that raised an error.
 * @param itemsFound Number of items for which data was found.
 * @param totalItems Size of the work list.
 */
async function updateJobProgress(
  jobId: string,
  processed: number,
  succeeded: number,
  failed: number,
  itemsFound: number,
  totalItems: number
): Promise<void> {
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: {
      processed,
      succeeded,
      failed,
      itemsFound,
      totalItems,
    },
  });
}
|
||||
|
||||
/**
 * Reports whether the job row currently has status 'stopping'.
 * A job that cannot be found counts as not stopping.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!job) {
    return false;
  }
  return job.status === 'stopping';
}
|
||||
|
||||
/**
 * Closes a background job and stamps completedAt.
 *
 * @param jobId Job to close.
 * @param error When a non-empty message is given the job is marked 'failed'
 *              and the message is stored; otherwise it is marked 'completed'.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const status = error ? 'failed' : 'completed';
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status, error, completedAt: new Date() },
  });
}
|
||||
|
||||
/**
 * Country priority order — largest OSM church counts first, since those
 * have the most un-enriched churches. Covers all countries from the
 * CATHOLIC_COUNTRIES lists in import-osm-churches.ts.
 *
 * Each code must appear only once: priority mode queries the countries in
 * order before any church is marked as searched, so a repeated code would
 * queue the same churches twice. ('RS' was previously listed in two tiers;
 * it now stays only in its higher tier.)
 */
const COUNTRY_PRIORITY = [
  // Top tier: 5000+ OSM churches
  'FR', 'IT', 'ES', 'DE', 'PL', 'BR',
  // High tier: 1000-5000
  'PT', 'AT', 'BE', 'CZ', 'PH', 'HU', 'US', 'MX', 'HR', 'GB',
  'CR', 'SK', 'EC', 'CH', 'AR', 'CA', 'CO', 'NL', 'IE', 'IN',
  'SI', 'AU',
  // Medium tier: 100-1000
  'PE', 'RO', 'KR', 'CL', 'ID', 'LT', 'BO', 'VN', 'BA', 'BY',
  'UA', 'VE', 'HN', 'UG', 'CD', 'GT', 'CU', 'SV', 'NI', 'PA',
  'DO', 'CN', 'JP', 'LV', 'RS', 'TZ', 'KE', 'AL', 'RU',
  // Lower tier: remaining countries
  'LU', 'MT', 'NZ', 'PG', 'FJ', 'NC', 'PF', 'UY', 'PY', 'HT',
  'CM', 'RW', 'BI', 'MG', 'MW', 'ZM', 'ZW', 'MZ', 'AO', 'NG',
  'BJ', 'TG', 'CI', 'BF', 'ML', 'NE', 'SN', 'GN', 'LR', 'SL',
  'GH', 'GA', 'CG', 'CF', 'TD', 'SD', 'ET', 'ER', 'SO',
  'TL', 'MY', 'SG', 'TH', 'LA', 'KH', 'MM', 'LK', 'BD', 'PK',
  'LB', 'IL', 'PS', 'JO', 'SY', 'IQ',
  'GF', 'SR', 'GY', 'BS', 'BB', 'JM', 'TT', 'GD', 'LC', 'VC',
  'AG', 'DM', 'KN', 'MC', 'SM', 'VA', 'LI', 'AD',
  'MK', 'EE', 'GE', 'AM',
  'NA', 'BW', 'LS', 'SZ', 'MU', 'SC', 'KM', 'CV', 'ST', 'GQ',
  'DJ', 'GM', 'BT', 'NP', 'AF', 'KZ', 'UZ', 'TM', 'TJ', 'KG',
  'MN', 'BN', 'MV', 'WS', 'TO', 'VU', 'SB', 'KI', 'NR', 'TV',
  'FM', 'MH', 'PW',
];
|
||||
|
||||
/** Outcome of one Places text search; the optional fields are set only when the top hit carries them. */
interface GooglePlacesResult {
  /** True when the search returned at least one place. */
  found: boolean;
  website?: string;
  phone?: string;
  placeId?: string;
}

/** Counters accumulated over one enrichment run. */
interface EnrichmentStats {
  processed: number;
  enriched: number;
  notFound: number;
  errors: number;
  websitesAdded: number;
  phonesAdded: number;
}
|
||||
|
||||
/**
 * Looks a church up through the Google Places text search endpoint.
 *
 * The query is "<name> <city>, <state>" (missing parts omitted), biased to a
 * 500 m circle around the given coordinates. Only the first place is used.
 *
 * @param name Church name.
 * @param city City, if known.
 * @param state State/region, if known.
 * @param latitude Centre of the location bias.
 * @param longitude Centre of the location bias.
 * @returns `{ found: false }` when no place is returned, otherwise the first
 *          place's website, phone and id (each possibly undefined).
 * @throws Error('RATE_LIMITED') on HTTP 429; any other failure (including a
 *         request timeout) is rethrown unchanged.
 */
async function searchGooglePlaces(
  name: string,
  city: string | null,
  state: string | null,
  latitude: number,
  longitude: number
): Promise<GooglePlacesResult> {
  if (!GOOGLE_PLACES_API_KEY) {
    throw new Error('GOOGLE_PLACES_API_KEY not set in environment');
  }

  // Without a timeout a stalled connection would block the whole run indefinitely.
  const REQUEST_TIMEOUT_MS = 15000;

  // Build search query
  const location = [city, state].filter(Boolean).join(', ');
  const textQuery = `${name} ${location}`.trim();

  try {
    const response = await axios.post(
      PLACES_API_URL,
      {
        textQuery,
        locationBias: {
          circle: {
            center: {
              latitude,
              longitude,
            },
            radius: 500, // 500 meters
          },
        },
      },
      {
        headers: {
          'Content-Type': 'application/json',
          'X-Goog-Api-Key': GOOGLE_PLACES_API_KEY,
          'X-Goog-FieldMask': 'places.id,places.displayName,places.websiteUri,places.nationalPhoneNumber',
        },
        timeout: REQUEST_TIMEOUT_MS,
      }
    );

    const place = response.data.places?.[0]; // Take first result
    if (!place) {
      return { found: false };
    }

    return {
      found: true,
      website: place.websiteUri || undefined,
      phone: place.nationalPhoneNumber || undefined,
      placeId: place.id || undefined,
    };
  } catch (error: unknown) {
    if (axios.isAxiosError(error) && error.response?.status === 429) {
      console.error('Rate limited by Google Places API');
      throw new Error('RATE_LIMITED');
    }
    throw error;
  }
}
|
||||
|
||||
/**
 * Selects OSM-sourced churches that have never been searched on Google
 * (googleSearchedAt is null), queries Google Places for each one and stores
 * the result.
 *
 * Selection:
 *  - with `countryCode`: up to `limit` churches of that country, oldest first;
 *  - without: walks COUNTRY_PRIORITY in order, exhausting each country before
 *    the next, until `limit` (default 390) churches are queued. Countries
 *    absent from COUNTRY_PRIORITY are not visited in this mode.
 *
 * @param countryCode Restrict the run to one country.
 * @param limit Maximum number of churches to process.
 * @param dryRun When true nothing is written to the church table.
 * @param jobId Background job to report progress to; also polled for a stop request.
 * @returns Counters for the run.
 */
async function enrichChurches(
  countryCode?: string,
  limit?: number,
  dryRun: boolean = false,
  jobId?: string | null
): Promise<EnrichmentStats> {
  const stats: EnrichmentStats = {
    processed: 0,
    enriched: 0,
    notFound: 0,
    errors: 0,
    websitesAdded: 0,
    phonesAdded: 0,
  };

  let churches;

  if (countryCode) {
    // Manual override: process specific country
    console.log(`Manual mode: Processing country ${countryCode}`);
    churches = await prisma.church.findMany({
      where: {
        source: 'osm',
        googleSearchedAt: null,
        country: countryCode,
      },
      take: limit,
      orderBy: { createdAt: 'asc' },
    });
  } else {
    // Priority mode: sequential through countries (exhaust each before moving on)
    console.log('Priority mode: Processing countries sequentially');
    console.log(`Top priority countries: ${COUNTRY_PRIORITY.slice(0, 10).join(', ')}...\n`);

    churches = [];
    const targetTotal = limit || 390;

    // Visit each country once even if the list repeats a code. Otherwise the same
    // churches would be queued (and billed) twice, because nothing is marked as
    // searched until the processing loop below runs.
    const countries = Array.from(new Set(COUNTRY_PRIORITY));

    for (const country of countries) {
      if (churches.length >= targetTotal) break;

      const remaining = targetTotal - churches.length;
      const batch = await prisma.church.findMany({
        where: {
          source: 'osm',
          googleSearchedAt: null,
          country,
        },
        take: remaining,
        orderBy: { createdAt: 'asc' },
      });

      if (batch.length > 0) {
        churches.push(...batch);
        console.log(` Queued ${batch.length} churches from ${country}`);
      }
    }
  }

  console.log(`\nFound ${churches.length} churches to enrich`);
  console.log('');

  for (const church of churches) {
    stats.processed++;

    try {
      console.log(`[${stats.processed}/${churches.length}] ${church.name} (${church.city}, ${church.state})`);

      const result = await searchGooglePlaces(
        church.name,
        church.city,
        church.state,
        church.latitude,
        church.longitude
      );

      if (result.found) {
        console.log(' ✓ Found on Google Places');

        if (result.website) {
          console.log(` Website: ${result.website}`);
          stats.websitesAdded++;
        }

        if (result.phone) {
          console.log(` Phone: ${result.phone}`);
          stats.phonesAdded++;
        }

        if (!dryRun) {
          // Existing values are kept whenever Google has nothing for a field.
          await prisma.church.update({
            where: { id: church.id },
            data: {
              website: result.website || church.website,
              phone: result.phone || church.phone,
              googlePlaceId: result.placeId || church.googlePlaceId,
              hasWebsite: !!(result.website || church.website),
              googleSearchedAt: new Date(),
            },
          });
          if (result.website || result.phone) {
            stats.enriched++;
          }
        }
      } else {
        console.log(' ✗ Not found on Google Places');
        stats.notFound++;

        // Mark as attempted so we don't re-query this church
        if (!dryRun) {
          await prisma.church.update({
            where: { id: church.id },
            data: { googleSearchedAt: new Date() },
          });
        }
      }
    } catch (error: unknown) {
      stats.errors++;
      const message = error instanceof Error ? error.message : String(error);
      if (message === 'RATE_LIMITED') {
        console.error(' ⚠ Rate limited, stopping enrichment');
        break;
      }
      console.error(` ✗ Error: ${message}`);
    }

    // Rate limiting. This sits outside the try block so that a failed request
    // is also followed by the pause instead of an immediate next request.
    await new Promise((resolve) => setTimeout(resolve, RATE_LIMIT_MS));

    // Job tracking: update progress every 10 items and check for stop
    if (jobId && stats.processed % 10 === 0) {
      await updateJobProgress(jobId, stats.processed, stats.enriched, stats.errors, stats.enriched, churches.length);
      const stopping = await checkJobStopping(jobId);
      if (stopping) {
        console.log('\nJob stop requested via admin dashboard.');
        break;
      }
    }

    // Progress update every 50 churches
    if (stats.processed % 50 === 0) {
      console.log('');
      console.log(`Progress: ${stats.processed}/${churches.length} processed`);
      console.log(` Enriched: ${stats.enriched}, Not found: ${stats.notFound}, Errors: ${stats.errors}`);
      console.log('');
    }
  }

  // Final job update
  if (jobId) {
    await updateJobProgress(jobId, stats.processed, stats.enriched, stats.errors, stats.enriched, churches.length);
  }

  return stats;
}
|
||||
|
||||
/**
 * CLI entry point for Google Places enrichment.
 *
 * Flags: --country <code>, --limit <n> (default 10), --all (no explicit
 * limit), --dry-run, --job-id <id> (resume an existing background job).
 *
 * Exits with status 1 when the API key is missing or --limit is invalid.
 * If the run throws, the background job (if any) is marked 'failed' with the
 * error message before the error is rethrown; database connections are
 * closed in every case.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const countryIndex = args.indexOf('--country');
  const limitIndex = args.indexOf('--limit');
  const dryRun = args.includes('--dry-run');
  const all = args.includes('--all');

  const countryCode = countryIndex !== -1 ? args[countryIndex + 1] : undefined;
  const limit = all ? undefined : limitIndex !== -1 ? parseInt(args[limitIndex + 1], 10) : 10;

  if (!GOOGLE_PLACES_API_KEY) {
    console.error('Error: GOOGLE_PLACES_API_KEY not set in environment');
    console.error('Add it to your .env file');
    process.exit(1);
  }

  // A missing or non-numeric --limit value parses to NaN, which must not reach Prisma's `take`.
  if (limit !== undefined && (!Number.isInteger(limit) || limit <= 0)) {
    console.error('Error: --limit requires a positive integer');
    process.exit(1);
  }

  console.log('============================================================');
  console.log('Google Places Church Enrichment');
  console.log('============================================================');
  console.log(`Country: ${countryCode || 'All'}`);
  console.log(`Limit: ${limit || 'No limit'}`);
  console.log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  console.log('============================================================');
  console.log('');

  let jobId: string | null = null;

  try {
    // Job tracking
    jobId = await createOrResumeJob(args);
    if (!jobId && !dryRun) {
      jobId = await createNewJob({ countryCode, limit, dryRun });
    }
    if (jobId) console.log(`Job ID: ${jobId}\n`);

    const stats = await enrichChurches(countryCode, limit, dryRun, jobId);

    console.log('');
    console.log('============================================================');
    console.log('Enrichment Summary');
    console.log('============================================================');
    console.log(`Churches processed: ${stats.processed}`);
    console.log(`Churches enriched: ${stats.enriched}`);
    console.log(`Not found on Google: ${stats.notFound}`);
    console.log(`Websites added: ${stats.websitesAdded}`);
    console.log(`Phone numbers added: ${stats.phonesAdded}`);
    console.log(`Errors encountered: ${stats.errors}`);
    console.log('============================================================');

    // Complete job
    if (jobId) {
      await completeJob(jobId);
    }
  } catch (error: unknown) {
    // Record the failure so the job does not stay in 'running' forever.
    if (jobId) {
      const message = error instanceof Error ? error.message : String(error);
      try {
        await completeJob(jobId, message);
      } catch (recordError) {
        console.error('Failed to record job failure:', recordError);
      }
    }
    throw error;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

main().catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});
|
||||
624
scripts/enrich-with-reverse-geocode.ts
Normal file
624
scripts/enrich-with-reverse-geocode.ts
Normal file
@@ -0,0 +1,624 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Enrich churches with city/state/zip via Nominatim reverse geocoding (OSM)
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/enrich-with-reverse-geocode.ts --country FR --limit 10 --dry-run
|
||||
* npx tsx scripts/enrich-with-reverse-geocode.ts --country FR --continuous
|
||||
* npx tsx scripts/enrich-with-reverse-geocode.ts --continuous
|
||||
*
|
||||
* Rate limit: 1 request/second (Nominatim usage policy — mandatory).
|
||||
 * Full pass of ~193K churches takes roughly 2.5 days at 1.1 s per request.
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import axios from 'axios';
|
||||
|
||||
// Fresh DB connection (not cached singleton)
const pool = new Pool({
  connectionString: process.env.DATABASE_URL,
});
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

// Nominatim reverse-geocoding endpoint and pacing/batching settings.
const NOMINATIM_URL = 'https://nominatim.openstreetmap.org/reverse';
const RATE_LIMIT_MS = 1100; // Slightly over 1s to stay safe
const BATCH_SIZE = 50;
const PROGRESS_INTERVAL = 10;
|
||||
|
||||
// --- Job Tracking ---
|
||||
|
||||
/**
 * Resumes an existing background job when `--job-id <id>` is on the command line.
 *
 * @param args Command-line arguments (without node/script path).
 * @returns The job id after marking it 'running' with a fresh startedAt,
 *          or null when the flag is absent.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagPosition = args.indexOf('--job-id');
  if (flagPosition === -1) {
    return null;
  }

  const jobId = args[flagPosition + 1];
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||||
|
||||
/**
 * Inserts a new 'reverse-geocode-enrichment' background job that is already running.
 *
 * @param config Run parameters stored on the job row.
 * @returns The id of the created job.
 */
async function createNewJob(config: Record<string, unknown>): Promise<string> {
  const { id } = await prisma.backgroundJob.create({
    data: {
      type: 'reverse-geocode-enrichment',
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });
  return id;
}
|
||||
|
||||
/**
 * Copies the run counters onto the background job row.
 * `stats.enriched` is stored as both `succeeded` and `itemsFound`.
 *
 * @param jobId Job to update.
 * @param stats Current counters.
 * @param totalItems Value stored as the job's total.
 */
async function updateJobProgress(jobId: string, stats: EnrichmentStats, totalItems: number): Promise<void> {
  const { processed, enriched, errors } = stats;
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: {
      processed,
      succeeded: enriched,
      failed: errors,
      itemsFound: enriched,
      totalItems,
    },
  });
}
|
||||
|
||||
/**
 * Reports whether the job row currently has status 'stopping'.
 * A job that cannot be found counts as not stopping.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!job) {
    return false;
  }
  return job.status === 'stopping';
}
|
||||
|
||||
/**
 * Closes a background job and stamps completedAt.
 *
 * @param jobId Job to close.
 * @param error When a non-empty message is given the job is marked 'failed'
 *              and the message is stored; otherwise it is marked 'completed'.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const status = error ? 'failed' : 'completed';
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status, error, completedAt: new Date() },
  });
}
|
||||
|
||||
// --- Types ---
|
||||
|
||||
/** Columns selected from the church table for reverse geocoding. */
interface ChurchRecord {
  id: string;
  name: string;
  address: string | null;
  city: string | null;
  state: string | null;
  zip: string | null;
  country: string;
  latitude: number;
  longitude: number;
}

/** Address components read from a Nominatim reverse-geocoding response. */
interface NominatimAddress {
  house_number?: string;
  road?: string;
  city?: string;
  town?: string;
  village?: string;
  municipality?: string;
  hamlet?: string;
  suburb?: string;
  neighbourhood?: string;
  state?: string;
  province?: string;
  postcode?: string;
  country_code?: string;
}

/** Reverse-geocoding response body; `error` is checked by the caller before `address` is used. */
interface NominatimResponse {
  display_name?: string;
  address?: NominatimAddress;
  error?: string;
}

/** Counters accumulated over a reverse-geocoding run. */
interface EnrichmentStats {
  processed: number;
  enriched: number;
  noCity: number;
  errors: number;
  skippedExisting: number;
  cycles: number;
  startTime: number;
}
|
||||
|
||||
// --- Circuit Breaker ---
|
||||
|
||||
/**
 * Pauses work after repeated Nominatim failures.
 *
 * The breaker opens once `recordFailure()` has been called `threshold` times
 * without an intervening `reset()`. While open, `checkAndWait()` sleeps for
 * the current backoff, sends a probe request and either closes the breaker
 * or doubles the backoff (capped at `maxBackoffMs`).
 */
class CircuitBreaker {
  private failures = 0;
  private isOpen = false;
  private backoffMs = 60000; // Start at 60s for Nominatim
  private readonly maxBackoffMs = 300000; // 5 minutes
  private readonly threshold = 5;

  /**
   * @returns true when work may proceed (breaker closed, or the probe got
   *          HTTP 200); false when the probe failed and the backoff was increased.
   */
  async checkAndWait(): Promise<boolean> {
    if (!this.isOpen) {
      return true;
    }

    log(`Circuit breaker open. Waiting ${Math.round(this.backoffMs / 1000)}s before retry...`);
    await sleep(this.backoffMs);

    const recovered = await this.probe();
    if (recovered) {
      this.reset();
      log('Circuit breaker closed: Nominatim is back');
      return true;
    }

    this.backoffMs = Math.min(this.backoffMs * 2, this.maxBackoffMs);
    return false;
  }

  /** Sends one test request; resolves to true only for an HTTP 200 answer. */
  private async probe(): Promise<boolean> {
    try {
      const resp = await axios.get(NOMINATIM_URL, {
        params: { lat: 48.8566, lon: 2.3522, format: 'json' },
        headers: { 'User-Agent': 'NearestMass/1.0 (privacy@nearestmass.com)' },
        timeout: 10000,
      });
      return resp.status === 200;
    } catch {
      // Still down
      return false;
    }
  }

  /** Counts one failure and opens the breaker when the threshold is reached. */
  recordFailure() {
    this.failures += 1;
    const shouldOpen = this.failures >= this.threshold && !this.isOpen;
    if (!shouldOpen) {
      return;
    }
    this.isOpen = true;
    this.backoffMs = 60000;
    log(`Circuit breaker OPEN after ${this.failures} consecutive failures`);
  }

  /** Clears the failure count, closes the breaker and restores the initial backoff. */
  reset() {
    if (this.failures === 0 && !this.isOpen) {
      return;
    }
    this.failures = 0;
    this.isOpen = false;
    this.backoffMs = 60000;
  }

  /** Whether the breaker is currently open. */
  get opened() {
    return this.isOpen;
  }
}
|
||||
|
||||
// --- Helpers ---
|
||||
|
||||
let shuttingDown = false;
|
||||
|
||||
/** Prints `msg` to stdout, prefixed with the current time as an ISO-8601 string in square brackets. */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
|
||||
|
||||
/** Prints `msg` to stderr, prefixed with the current time as an ISO-8601 string in square brackets. */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error('[' + stamp + '] ' + msg);
}
|
||||
|
||||
/**
 * Waits `ms` milliseconds, but wakes up early once `shuttingDown` becomes
 * true (checked once per second).
 *
 * Both timers are cleared as soon as the promise settles. Previously a
 * separate clean-up timer of `ms + 100` was always left pending, which kept
 * the event loop alive for the full duration even after an early wake-up.
 *
 * @param ms Maximum time to wait, in milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise(resolve => {
    const timer = setTimeout(() => {
      clearInterval(poll);
      resolve();
    }, ms);
    const poll = setInterval(() => {
      if (shuttingDown) {
        clearTimeout(timer);
        clearInterval(poll);
        resolve();
      }
    }, 1000);
  });
}
|
||||
|
||||
// --- Nominatim API ---
|
||||
|
||||
/**
 * Asks Nominatim for the address at the given coordinates.
 *
 * @param lat Latitude.
 * @param lng Longitude (sent as `lon`).
 * @returns The response body as-is. HTTP and network failures from axios propagate.
 */
async function reverseGeocode(lat: number, lng: number): Promise<NominatimResponse> {
  const requestConfig = {
    params: {
      lat,
      lon: lng,
      format: 'json',
      zoom: 16,
      addressdetails: 1,
    },
    headers: {
      'User-Agent': 'NearestMass/1.0 (privacy@nearestmass.com)',
      'Accept-Language': 'en',
    },
    timeout: 15000,
  };

  const { data } = await axios.get(NOMINATIM_URL, requestConfig);
  return data;
}
|
||||
|
||||
/**
 * Picks the locality name from an address: the first non-empty value among
 * city, town, village, municipality and hamlet, in that order; null if none.
 */
function extractCity(address: NominatimAddress): string | null {
  const candidates = [
    address.city,
    address.town,
    address.village,
    address.municipality,
    address.hamlet,
  ];
  for (const value of candidates) {
    if (value) {
      return value;
    }
  }
  return null;
}
|
||||
|
||||
/** Returns the address's state, falling back to its province; null when both are missing or empty. */
function extractState(address: NominatimAddress): string | null {
  const region = address.state || address.province;
  if (region) {
    return region;
  }
  return null;
}
|
||||
|
||||
/**
 * Builds a street line as "<house_number> <road>" from whichever of the two
 * parts are present; null when neither is.
 */
function extractAddress(address: NominatimAddress): string | null {
  const pieces = [address.house_number, address.road].filter(
    (piece): piece is string => Boolean(piece)
  );
  return pieces.length > 0 ? pieces.join(' ') : null;
}
|
||||
|
||||
// --- Database Queries ---
|
||||
|
||||
/**
 * Filter shared by the batch query and the remaining-count query: churches
 * with no city that have not been reverse geocoded yet, optionally limited
 * to one country.
 *
 * Rows whose latitude and longitude are both 0 are excluded. That pair is the
 * "no coordinates yet" placeholder that enrich-with-forward-geocode.ts fills
 * in. Reverse geocoding it cannot yield an address and would stamp
 * reverseGeocodedAt, hiding the church from this script once real coordinates
 * arrive. (The former `{ not: undefined }` conditions did not filter anything:
 * Prisma ignores `undefined` values in a where clause.)
 */
function pendingReverseGeocodeWhere(countryCode?: string) {
  return {
    city: null,
    reverseGeocodedAt: null,
    NOT: { latitude: 0, longitude: 0 },
    ...(countryCode ? { country: countryCode } : {}),
  };
}

/**
 * Loads the next churches to reverse geocode, ordered by country and then by creation time.
 *
 * @param batchSize Maximum number of rows to return.
 * @param countryCode Restrict the batch to one country.
 */
async function getNextBatch(
  batchSize: number,
  countryCode?: string,
): Promise<ChurchRecord[]> {
  return prisma.church.findMany({
    where: pendingReverseGeocodeWhere(countryCode),
    select: {
      id: true, name: true, address: true, city: true, state: true, zip: true,
      country: true, latitude: true, longitude: true,
    },
    take: batchSize,
    orderBy: [
      { country: 'asc' },
      { createdAt: 'asc' },
    ],
  });
}

/**
 * Counts the churches still waiting for reverse geocoding, using the same filter as getNextBatch.
 *
 * @param countryCode Restrict the count to one country.
 */
async function getTotalRemaining(countryCode?: string): Promise<number> {
  return prisma.church.count({
    where: pendingReverseGeocodeWhere(countryCode),
  });
}
|
||||
|
||||
// --- Main Processing ---
|
||||
|
||||
/**
 * Reverse geocodes one church and stores the result.
 *
 * On success only fields that are currently empty are filled (address, city,
 * state, zip), the country is replaced when it is the placeholder 'XX', and
 * reverseGeocodedAt is stamped. A response without usable address data only
 * stamps reverseGeocodedAt. Nothing is written in dry-run mode.
 *
 * Error handling:
 *  - HTTP 429, HTTP 5xx and requests that received no response at all
 *    (timeout, connection failure) are rethrown so the caller can retry or
 *    back off; the church is NOT marked as attempted. Previously the
 *    no-response case marked the church as attempted, so a network outage
 *    permanently skipped every church processed during it.
 *  - Any other error is logged and the church is marked as attempted.
 *
 * @param church Record to enrich.
 * @param stats Run counters; enriched / noCity / errors are updated here.
 * @param dryRun When true no database writes are made.
 */
async function processChurch(
  church: ChurchRecord,
  stats: EnrichmentStats,
  dryRun: boolean,
): Promise<void> {
  const label = `${church.name} (${church.country})`;

  try {
    const result = await reverseGeocode(church.latitude, church.longitude);

    if (result.error || !result.address) {
      log(` - [${stats.processed}] ${label} => no address data`);
      stats.noCity++;
      if (!dryRun) {
        await prisma.church.update({
          where: { id: church.id },
          data: { reverseGeocodedAt: new Date() },
        });
      }
      return;
    }

    const address = extractAddress(result.address);
    const city = extractCity(result.address);
    const state = extractState(result.address);
    const zip = result.address.postcode || null;

    if (city) {
      const addrStr = address ? `${address}, ` : '';
      log(` + [${stats.processed}] ${label} => ${addrStr}${city}, ${state || '?'}`);
      stats.enriched++;
    } else {
      log(` - [${stats.processed}] ${label} => no city in response`);
      stats.noCity++;
    }

    if (!dryRun) {
      const updateData: Record<string, unknown> = {
        reverseGeocodedAt: new Date(),
      };
      // Only update fields that are currently null
      if (address && !church.address) updateData.address = address;
      if (city && !church.city) updateData.city = city;
      if (state && !church.state) updateData.state = state;
      if (zip && !church.zip) updateData.zip = zip;
      // Update country if currently unknown (XX) and Nominatim returned one
      const countryCodeResult = result.address.country_code?.toUpperCase();
      if (church.country === 'XX' && countryCodeResult && countryCodeResult !== 'XX') {
        updateData.country = countryCodeResult;
      }

      await prisma.church.update({
        where: { id: church.id },
        data: updateData,
      });
    }
  } catch (error: unknown) {
    stats.errors++;

    const httpError = axios.isAxiosError(error) ? error : undefined;
    const status = httpError?.response?.status;
    const message = error instanceof Error ? error.message : String(error);

    // Handle rate limiting (429)
    if (status === 429) {
      logError(` ! [${stats.processed}] ${label} => rate limited (429), backing off...`);
      await sleep(5000); // Extra 5s backoff
      throw error;
    }

    // Handle server errors (5xx)
    if (status !== undefined && status >= 500) {
      logError(` ! [${stats.processed}] ${label} => server error (${status})`);
      throw error;
    }

    // Request failed before any response arrived (timeout, DNS, connection reset).
    // This is transient, so leave the church untouched and let the caller retry.
    if (httpError && !httpError.response) {
      logError(` ! [${stats.processed}] ${label} => network error (${message})`);
      throw error;
    }

    logError(` ! [${stats.processed}] ${label} => ${message}`);
    // Don't throw for non-retriable errors (just mark as attempted)
    if (!dryRun) {
      await prisma.church.update({
        where: { id: church.id },
        data: { reverseGeocodedAt: new Date() },
      });
    }
  }
}
|
||||
|
||||
/**
 * Processes pending churches batch by batch until none are left, the optional
 * limit is reached, or a shutdown/stop is requested.
 *
 * Each church is followed by a `RATE_LIMIT_MS` pause. Job progress is written
 * (and a dashboard stop request checked) every `PROGRESS_INTERVAL` processed
 * items, and a progress line is logged every 100 items.
 *
 * @param stats       Shared counters, mutated in place.
 * @param countryCode Optional country filter forwarded to `getNextBatch`.
 * @param limit       Optional maximum number of churches to process; a falsy
 *                    value means no limit.
 * @param dryRun      When true nothing is written for the churches themselves.
 * @param jobId       Background job to report progress to, if any.
 */
async function runSinglePass(
  stats: EnrichmentStats,
  countryCode?: string,
  limit?: number,
  dryRun: boolean = false,
  jobId?: string | null,
): Promise<void> {
  let totalProcessed = 0;
  const circuitBreaker = new CircuitBreaker();

  // A dry run never stamps `reverseGeocodedAt`, and `getNextBatch` takes no
  // cursor, so it presumably keeps returning the same rows. Remember what was
  // already previewed so a dry run terminates instead of looping forever.
  const previewedIds = new Set<ChurchRecord['id']>();

  while (!shuttingDown) {
    if (limit && totalProcessed >= limit) break;

    // Circuit breaker check
    if (circuitBreaker.opened) {
      const ok = await circuitBreaker.checkAndWait();
      if (!ok) continue;
    }

    const batchLimit = limit
      ? Math.min(BATCH_SIZE, limit - totalProcessed)
      : BATCH_SIZE;

    const batch = await getNextBatch(batchLimit, countryCode);
    const churches = dryRun
      ? batch.filter((candidate) => !previewedIds.has(candidate.id))
      : batch;
    if (churches.length === 0) break;

    for (const church of churches) {
      if (shuttingDown) break;
      if (limit && totalProcessed >= limit) break;

      stats.processed++;
      totalProcessed++;
      if (dryRun) previewedIds.add(church.id);

      try {
        await processChurch(church, stats, dryRun);
        circuitBreaker.reset();
      } catch {
        // Already logged in processChurch
        circuitBreaker.recordFailure();
      }

      // Rate limit: 1 request per second
      if (!shuttingDown) {
        await sleep(RATE_LIMIT_MS);
      }

      // Job tracking: update progress every PROGRESS_INTERVAL items
      if (jobId && stats.processed % PROGRESS_INTERVAL === 0) {
        await updateJobProgress(jobId, stats, 0);
        const stopping = await checkJobStopping(jobId);
        if (stopping) {
          log('Job stop requested via admin dashboard.');
          shuttingDown = true;
          break;
        }
      }

      // Progress logging
      if (stats.processed % 100 === 0) {
        const elapsed = (Date.now() - stats.startTime) / 1000;
        const rate = Math.round((stats.processed / elapsed) * 3600);
        const enrichRate = stats.processed > 0
          ? ((stats.enriched / stats.processed) * 100).toFixed(1)
          : '0.0';
        log(`Progress: ${stats.processed} processed, ${stats.enriched} enriched, ${stats.noCity} no-city, ${stats.errors} errors`);
        log(` Enrich rate: ${enrichRate}%, Rate: ~${rate}/hour`);
      }
    }
  }
}
|
||||
|
||||
/**
 * Runs enrichment cycles until a shutdown is requested.
 *
 * A cycle drains pending churches in batches of `BATCH_SIZE` (never as a dry
 * run). After a cycle that processed something there is a 10s pause; after an
 * empty cycle the loop waits up to one hour in 10s slices so a shutdown
 * request is noticed promptly.
 *
 * @param stats       Shared counters, mutated in place.
 * @param countryCode Optional country filter forwarded to `getNextBatch`.
 * @param jobId       Background job to report progress to, if any.
 */
async function runContinuous(
  stats: EnrichmentStats,
  countryCode?: string,
  jobId?: string | null,
): Promise<void> {
  log('Running in continuous mode. Press Ctrl+C to stop.');
  const breaker = new CircuitBreaker();

  while (!shuttingDown) {
    stats.cycles += 1;
    log(`--- Cycle ${stats.cycles} ---`);
    let processedInCycle = 0;

    while (!shuttingDown) {
      // Circuit breaker check: re-evaluate the loop while it reports not-ok.
      if (breaker.opened && !(await breaker.checkAndWait())) continue;

      const batch = await getNextBatch(BATCH_SIZE, countryCode);
      if (batch.length === 0) break;

      for (const church of batch) {
        if (shuttingDown) break;

        stats.processed += 1;
        processedInCycle += 1;

        try {
          await processChurch(church, stats, false);
          breaker.reset();
        } catch {
          breaker.recordFailure();
        }

        // Rate limit
        if (!shuttingDown) await sleep(RATE_LIMIT_MS);

        // Job tracking
        const progressDue = stats.processed % PROGRESS_INTERVAL === 0;
        if (jobId && progressDue) {
          await updateJobProgress(jobId, stats, 0);
          if (await checkJobStopping(jobId)) {
            log('Job stop requested via admin dashboard.');
            shuttingDown = true;
            break;
          }
        }

        // Progress logging
        if (stats.processed % 100 === 0) {
          const elapsedSeconds = (Date.now() - stats.startTime) / 1000;
          const rate = Math.round((stats.processed / elapsedSeconds) * 3600);
          log(`Progress: ${stats.processed} processed, ${stats.enriched} enriched, ${stats.noCity} no-city, ${stats.errors} errors (~${rate}/hour)`);
        }
      }
    }

    if (shuttingDown) break;

    if (processedInCycle > 0) {
      log(`Cycle ${stats.cycles} complete. ${processedInCycle} churches processed. Brief pause...`);
      await sleep(10000);
    } else {
      log('No churches needing reverse geocoding. Waiting 1 hour...');
      // 360 x 10s = 1 hour, interruptible between slices.
      for (let slice = 0; slice < 360 && !shuttingDown; slice++) {
        await sleep(10000);
      }
    }
  }
}
|
||||
|
||||
// --- Main ---
|
||||
|
||||
/**
 * CLI entry point for the Nominatim reverse-geocode enrichment.
 *
 * Flags: `--country <code>`, `--limit <n>`, `--dry-run`, `--continuous`
 * (plus whatever `createOrResumeJob` reads from the argument list).
 *
 * @throws Error when `--country`/`--limit` is given without a value or when
 *         `--limit` is not a non-negative integer. Previously such input was
 *         silently treated as "all countries" / "no limit".
 */
async function main() {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');
  const continuous = args.includes('--continuous');

  // Returns the value following `flag`, or undefined when the flag is absent.
  const flagValue = (flag: string): string | undefined => {
    const index = args.indexOf(flag);
    if (index === -1) return undefined;
    const value = args[index + 1];
    if (value === undefined || value.startsWith('--')) {
      throw new Error(`${flag} requires a value`);
    }
    return value;
  };

  const countryCode = flagValue('--country');

  // `--limit 0` keeps its historical meaning of "no limit".
  const limitArg = flagValue('--limit');
  let limit: number | undefined;
  if (limitArg !== undefined) {
    limit = Number.parseInt(limitArg, 10);
    if (!Number.isInteger(limit) || limit < 0) {
      throw new Error(`--limit must be a non-negative integer, got "${limitArg}"`);
    }
  }

  // Graceful shutdown
  process.on('SIGTERM', () => {
    log('Received SIGTERM, finishing current request...');
    shuttingDown = true;
  });
  process.on('SIGINT', () => {
    log('Received SIGINT, finishing current request...');
    shuttingDown = true;
  });

  log('============================================================');
  log('Nominatim Reverse Geocode Enrichment');
  log('============================================================');
  log(`Mode: ${continuous ? 'Continuous' : 'Single pass'}`);
  log(`Country: ${countryCode || 'All'}`);
  log(`Limit: ${limit || 'No limit'}`);
  log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  log(`Rate limit: ${RATE_LIMIT_MS}ms between requests`);
  log('============================================================');

  try {
    // Count remaining
    const remaining = await getTotalRemaining(countryCode);
    log(`Churches needing reverse geocoding: ${remaining}`);
    const estimatedHours = (remaining * RATE_LIMIT_MS / 1000 / 3600).toFixed(1);
    log(`Estimated time: ~${estimatedHours} hours @ 1 req/sec`);

    if (remaining === 0) {
      log('Nothing to do!');
      return;
    }

    // Job tracking
    let jobId = await createOrResumeJob(args);
    if (!jobId) {
      jobId = await createNewJob({ countryCode, limit, continuous, dryRun });
    }
    log(`Job ID: ${jobId}`);

    const stats: EnrichmentStats = {
      processed: 0,
      enriched: 0,
      noCity: 0,
      errors: 0,
      skippedExisting: 0,
      cycles: 0,
      startTime: Date.now(),
    };

    if (continuous) {
      await runContinuous(stats, countryCode, jobId);
    } else {
      await runSinglePass(stats, countryCode, limit, dryRun, jobId);
    }

    // Complete job
    if (jobId) {
      await updateJobProgress(jobId, stats, 0);
      await completeJob(jobId);
    }

    // Print summary
    const elapsed = ((Date.now() - stats.startTime) / 1000).toFixed(1);
    const enrichRate = stats.processed > 0
      ? ((stats.enriched / stats.processed) * 100).toFixed(1)
      : '0.0';

    log('');
    log('============================================================');
    log('Reverse Geocode Enrichment Summary');
    log('============================================================');
    log(`Churches processed: ${stats.processed}`);
    log(`Cities found: ${stats.enriched}`);
    log(`No city in response: ${stats.noCity}`);
    log(`Errors: ${stats.errors}`);
    log(`Enrich rate: ${enrichRate}%`);
    log(`Elapsed: ${elapsed}s`);
    if (stats.cycles > 0) {
      log(`Cycles completed: ${stats.cycles}`);
    }
    log('============================================================');
  } finally {
    // Release database handles on success and on failure alike.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
// Script entry point: an error escaping main() is logged and the process
// exits with status 1.
main().catch((fatal) => {
  const reason = fatal.message;
  logError(`Fatal error: ${reason}`);
  process.exit(1);
});
|
||||
328
scripts/enrich-with-wikidata.ts
Normal file
328
scripts/enrich-with-wikidata.ts
Normal file
@@ -0,0 +1,328 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Enrich churches with website URLs from Wikidata
|
||||
*
|
||||
* Queries Wikidata SPARQL endpoint for Catholic churches that have official websites,
|
||||
* then matches them to existing churches in the database via proximity + name matching.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/enrich-with-wikidata.ts --dry-run
|
||||
* npx tsx scripts/enrich-with-wikidata.ts --execute
|
||||
* npx tsx scripts/enrich-with-wikidata.ts --execute --country DE
|
||||
* npx tsx scripts/enrich-with-wikidata.ts --job-id <uuid>
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import axios from 'axios';
|
||||
|
||||
// PostgreSQL connection pool (DATABASE_URL) wired into Prisma through the pg
// driver adapter. `pool` is kept so it can be closed explicitly at exit.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

/** Wikidata SPARQL query endpoint. */
const WIKIDATA_SPARQL_URL = 'https://query.wikidata.org/sparql';
/** Max distance for matching, in kilometres. */
const MATCH_RADIUS_KM = 1.0;
/** SPARQL results per query (page size used with OFFSET). */
const BATCH_SIZE = 500;
|
||||
|
||||
/** Prints `msg` to stdout, prefixed with the current ISO-8601 timestamp. */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log(`[${stamp}] ${msg}`);
}
|
||||
|
||||
/** Prints `msg` to stderr, prefixed with the current ISO-8601 timestamp. */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error(`[${stamp}] ${msg}`);
}
|
||||
|
||||
/**
 * Great-circle distance between two points using the haversine formula.
 *
 * @param lat1 Latitude of the first point, in degrees.
 * @param lon1 Longitude of the first point, in degrees.
 * @param lat2 Latitude of the second point, in degrees.
 * @param lon2 Longitude of the second point, in degrees.
 * @returns Distance in kilometres (sphere of radius 6371 km).
 */
function haversineKm(lat1: number, lon1: number, lat2: number, lon2: number): number {
  const EARTH_RADIUS_KM = 6371;
  const toRadians = (degrees: number): number => degrees * Math.PI / 180;

  const halfLatDelta = toRadians(lat2 - lat1) / 2;
  const halfLonDelta = toRadians(lon2 - lon1) / 2;

  const a = Math.sin(halfLatDelta) ** 2 +
    Math.cos(toRadians(lat1)) * Math.cos(toRadians(lat2)) *
    Math.sin(halfLonDelta) ** 2;

  return EARTH_RADIUS_KM * 2 * Math.asin(Math.sqrt(a));
}
|
||||
|
||||
/**
 * Canonicalizes a name for comparison: lower-case, accents folded to their
 * base letters, everything except a-z, 0-9 and whitespace removed, whitespace
 * runs collapsed to one space, and the result trimmed.
 */
function normalizeForMatch(str: string): string {
  // NFD splits accented letters into base letter + combining mark; the marks
  // (U+0300–U+036F) are then dropped.
  const unaccented = str
    .toLowerCase()
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '');
  const alphanumeric = unaccented.replace(/[^a-z0-9\s]/g, '');
  return alphanumeric.replace(/\s+/g, ' ').trim();
}
|
||||
|
||||
/** One church row returned by the Wikidata SPARQL query. */
interface WikidataChurch {
  /** Label from the `?churchLabel` binding; empty string when absent. */
  label: string;
  /** Value of the `?website` binding; empty string when absent. */
  website: string;
  /** Latitude parsed from the `?lat` binding; 0 when absent. */
  lat: number;
  /** Longitude parsed from the `?lon` binding; 0 when absent. */
  lon: number;
  /** Entity URI with the `http://www.wikidata.org/entity/` prefix removed. */
  wikidataId: string;
}
|
||||
|
||||
/**
 * Fetches one page of Catholic churches that have a website and coordinates
 * from the Wikidata SPARQL endpoint.
 *
 * @param country Optional ISO alpha-2 code (case-insensitive). Only the codes
 *                in the table below are supported.
 * @param offset  Row offset of the page; pages hold `BATCH_SIZE` rows.
 * @returns The page of results; missing bindings become '' or 0.
 * @throws Error when `country` is given but is not in the supported table.
 *         Previously such a code silently dropped the filter and queried
 *         every country.
 * @throws Whatever `axios.get` rejects with (network error, timeout, HTTP error).
 */
async function queryWikidata(country?: string, offset = 0): Promise<WikidataChurch[]> {
  // SPARQL query for Catholic churches with websites
  let countryFilter = '';
  if (country) {
    // Map ISO alpha-2 to Wikidata country item
    const countryMap: Record<string, string> = {
      DE: 'Q183', FR: 'Q142', ES: 'Q29', IT: 'Q38', PL: 'Q36',
      PT: 'Q45', BR: 'Q155', NL: 'Q55', CZ: 'Q213', HU: 'Q28',
      AT: 'Q40', BE: 'Q31', CH: 'Q39', IE: 'Q27', GB: 'Q145',
      US: 'Q30', CA: 'Q16', MX: 'Q96', AR: 'Q414', CO: 'Q739',
      HR: 'Q224', SK: 'Q214', SI: 'Q215',
    };
    const qid = countryMap[country.toUpperCase()];
    if (!qid) {
      const supported = Object.keys(countryMap).join(', ');
      throw new Error(`Unsupported country "${country}" for Wikidata query (supported: ${supported})`);
    }
    countryFilter = `?church wdt:P17 wd:${qid} .`;
  }

  const sparql = `
    SELECT ?church ?churchLabel ?website ?lat ?lon WHERE {
      ?church wdt:P31/wdt:P279* wd:Q16970 .
      ?church wdt:P140 wd:Q9592 .
      ?church wdt:P856 ?website .
      ?church p:P625 ?coordStatement .
      ?coordStatement ps:P625 ?coord .
      BIND(geof:latitude(?coord) AS ?lat)
      BIND(geof:longitude(?coord) AS ?lon)
      ${countryFilter}
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en,de,fr,es,it,pt,pl,nl,cs,hu" . }
    }
    ORDER BY ?church
    LIMIT ${BATCH_SIZE}
    OFFSET ${offset}
  `;

  const response = await axios.get(WIKIDATA_SPARQL_URL, {
    params: { query: sparql, format: 'json' },
    headers: {
      'User-Agent': 'NearestMass/1.0 (https://nearestmass.com; contact: privacy@nearestmass.com)',
      'Accept': 'application/sparql-results+json',
    },
    timeout: 60000,
  });

  const bindings = response.data?.results?.bindings || [];
  return bindings.map((b: any) => ({
    label: b.churchLabel?.value || '',
    website: b.website?.value || '',
    lat: parseFloat(b.lat?.value || '0'),
    lon: parseFloat(b.lon?.value || '0'),
    wikidataId: b.church?.value?.replace('http://www.wikidata.org/entity/', '') || '',
  }));
}
|
||||
|
||||
/** Best database candidate found for a Wikidata church. */
interface MatchResult {
  /** `id` of the matched church row. */
  churchId: string;
  /** `name` of the matched church row. */
  churchName: string;
  /** Haversine distance to the Wikidata coordinates, in kilometres. */
  distance: number;
  /** Share (0–1) of the Wikidata label's words also present in the church name. */
  nameScore: number;
}
|
||||
|
||||
/**
 * Looks for the database church (without a website) that best corresponds to
 * a Wikidata church.
 *
 * Candidates come from a ±0.01° bounding box (at most 20 rows) and must lie
 * within `MATCH_RADIUS_KM`. A candidate is kept when at least half of the
 * Wikidata label's words (3+ characters) occur in its name, or when it is no
 * further than 100 m away. The highest name score wins; ties go to the
 * nearer church.
 *
 * @param wdChurch Wikidata church to match.
 * @returns The best candidate, or null when none qualifies.
 */
async function findMatch(wdChurch: WikidataChurch): Promise<MatchResult | null> {
  const { lat, lon } = wdChurch;

  // Find nearby churches without a website
  const nearby = await prisma.church.findMany({
    where: {
      website: null,
      latitude: { gte: lat - 0.01, lte: lat + 0.01 },
      longitude: { gte: lon - 0.01, lte: lon + 0.01 },
    },
    select: { id: true, name: true, latitude: true, longitude: true },
    take: 20,
  });

  if (nearby.length === 0) return null;

  // Words of 3+ characters from the normalized form of a name.
  const wordsOf = (name: string): string[] =>
    normalizeForMatch(name).split(' ').filter(w => w.length >= 3);

  const wdWords = wordsOf(wdChurch.label);
  let bestMatch: MatchResult | null = null;

  for (const candidate of nearby) {
    const distance = haversineKm(lat, lon, candidate.latitude, candidate.longitude);
    if (distance > MATCH_RADIUS_KM) continue;

    const candidateWords = wordsOf(candidate.name);
    const sharedCount = wdWords.filter(w => candidateWords.includes(w)).length;
    const nameScore = wdWords.length > 0 ? sharedCount / wdWords.length : 0;

    // Require at least 50% word overlap or distance < 100m
    const tooWeak = nameScore < 0.5 && distance > 0.1;
    if (tooWeak) continue;

    const isBetter =
      !bestMatch ||
      nameScore > bestMatch.nameScore ||
      (nameScore === bestMatch.nameScore && distance < bestMatch.distance);

    if (isBetter) {
      bestMatch = {
        churchId: candidate.id,
        churchName: candidate.name,
        distance,
        nameScore,
      };
    }
  }

  return bestMatch;
}
|
||||
|
||||
// --- Job Tracking ---
|
||||
|
||||
/**
 * Resumes the background job named by `--job-id <id>`, if that flag is present.
 *
 * The job row is switched to status 'running' and its `startedAt` is reset.
 *
 * @param args Command-line arguments (without node/script path).
 * @returns The job id following `--job-id`, or null when the flag is absent.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagPosition = args.indexOf('--job-id');
  if (flagPosition === -1) return null;

  const jobId = args[flagPosition + 1];
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||||
|
||||
/**
 * CLI entry point for the Wikidata website enrichment.
 *
 * Pages through the Wikidata results, matches each church against the
 * database and, with `--execute`, stores the website on the matched row.
 * Without `--execute` the run is a dry run and no church is modified.
 *
 * Flags: `--execute`, `--country <code>`, `--job-id <id>`.
 *
 * @throws Rethrows any error raised while querying or updating, after the
 *         job (if any) has been marked as failed.
 */
async function main() {
  const args = process.argv.slice(2);
  const dryRun = !args.includes('--execute');
  const countryIdx = args.indexOf('--country');
  const country = countryIdx !== -1 ? args[countryIdx + 1] : undefined;

  log('============================================================');
  log('Wikidata Church Website Enrichment');
  log('============================================================');
  log(`Mode: ${dryRun ? 'Dry run' : 'Execute'}`);
  log(`Country: ${country || 'All'}`);
  log('============================================================');

  try {
    // Job tracking
    let jobId = await createOrResumeJob(args);
    if (!jobId && !dryRun) {
      const job = await prisma.backgroundJob.create({
        data: {
          type: 'wikidata-enrichment',
          status: 'running',
          startedAt: new Date(),
          config: { country, dryRun },
        },
      });
      jobId = job.id;
      log(`Job ID: ${jobId}`);
    }

    let totalFetched = 0;
    let matched = 0;
    let updated = 0;
    let noMatch = 0;
    // Records dropped because website or coordinates were missing.
    let skipped = 0;
    let offset = 0;

    try {
      while (true) {
        log(`Querying Wikidata (offset ${offset})...`);
        const results = await queryWikidata(country, offset);

        if (results.length === 0) {
          log('No more results from Wikidata.');
          break;
        }

        totalFetched += results.length;
        log(`Fetched ${results.length} churches from Wikidata (total: ${totalFetched})`);

        for (const wdChurch of results) {
          if (!wdChurch.website || !wdChurch.lat || !wdChurch.lon) {
            skipped++;
            continue;
          }

          const match = await findMatch(wdChurch);

          if (!match) {
            noMatch++;
            continue;
          }

          matched++;
          log(` Match: "${wdChurch.label}" (${wdChurch.wikidataId}) -> "${match.churchName}" (dist: ${match.distance.toFixed(3)}km, score: ${match.nameScore.toFixed(2)})`);

          if (!dryRun) {
            await prisma.church.update({
              where: { id: match.churchId },
              data: {
                website: wdChurch.website,
                hasWebsite: true,
              },
            });
            updated++;
          }
        }

        // Rate limit SPARQL queries
        await new Promise(r => setTimeout(r, 2000));
        offset += BATCH_SIZE;

        // Update job progress
        if (jobId) {
          await prisma.backgroundJob.update({
            where: { id: jobId },
            data: {
              processed: totalFetched,
              succeeded: updated,
              itemsFound: matched,
            },
          });

          // Check for stop
          const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
          if (job?.status === 'stopping') {
            log('Job stop requested.');
            break;
          }
        }
      }
    } catch (error: any) {
      logError(`Error: ${error.message}`);
      if (jobId) {
        // A failing status write must not hide the error that got us here.
        try {
          await prisma.backgroundJob.update({
            where: { id: jobId },
            data: { status: 'failed', error: error.message, completedAt: new Date() },
          });
        } catch (jobError: any) {
          logError(`Could not mark job ${jobId} as failed: ${jobError?.message}`);
        }
      }
      throw error;
    }

    // Complete job
    if (jobId) {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: { status: 'completed', completedAt: new Date(), processed: totalFetched, succeeded: updated, itemsFound: matched },
      });
    }

    log('');
    log('============================================================');
    log('Wikidata Enrichment Summary');
    log('============================================================');
    log(`Wikidata churches fetched: ${totalFetched}`);
    log(`Matched to DB churches: ${matched}`);
    log(`Websites updated: ${updated}`);
    log(`No match found: ${noMatch}`);
    log(`Skipped (missing website/coordinates): ${skipped}`);
    log('============================================================');
  } finally {
    // Release database handles on success and on failure alike.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
// Script entry point: an error escaping main() is logged and the process
// exits with status 1.
main().catch((fatal) => {
  const reason = fatal.message;
  logError(`Fatal error: ${reason}`);
  process.exit(1);
});
|
||||
@@ -491,10 +491,14 @@ async function processSchedulesForDiocese(
|
||||
})),
|
||||
});
|
||||
|
||||
// Mark church as scraped
|
||||
// Update church metadata from detail (pastor, phone) if available
|
||||
const churchUpdateData: Record<string, unknown> = { lastScrapedAt: new Date() };
|
||||
if (detail.parroco) churchUpdateData.pastorName = detail.parroco;
|
||||
if (detail.telefono) churchUpdateData.phone = detail.telefono;
|
||||
|
||||
await tx.church.update({
|
||||
where: { id: dbId },
|
||||
data: { lastScrapedAt: new Date() },
|
||||
data: churchUpdateData,
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
623
scripts/match-search-results.ts
Normal file
623
scripts/match-search-results.ts
Normal file
@@ -0,0 +1,623 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Second-pass matching: analyze stored ChromaDB search results to find websites
|
||||
* that the FreeSearch first pass missed.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/match-search-results.ts --dry-run
|
||||
* npx tsx scripts/match-search-results.ts --country IT --limit 100
|
||||
* npx tsx scripts/match-search-results.ts --threshold 0.3
|
||||
*
|
||||
* Algorithm:
|
||||
* 1. Get churches without websites that have been FreeSearch'd
|
||||
* 2. Query ChromaDB search_results collection for semantically similar results
|
||||
* 3. Cross-church matching: URLs from nearby churches may match
|
||||
* 4. URL frequency analysis: URLs appearing for multiple churches in same area
|
||||
* 5. Verify best candidates against page content
|
||||
* 6. Update church.website if verified
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { Collection } from 'chromadb';
|
||||
import axios from 'axios';
|
||||
import { getCollection, COLLECTION_NAMES } from '../src/chromadb/collections';
|
||||
import { embedSingle } from '../src/chromadb/embeddings';
|
||||
|
||||
// Fresh DB connection: a dedicated pg pool (DATABASE_URL) handed to Prisma
// through the pg driver adapter.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
// --- Job Tracking ---
|
||||
/**
 * Resumes the background job named by `--job-id <id>`, if that flag is present.
 *
 * The job row is switched to status 'running' and its `startedAt` is reset.
 *
 * @param args Command-line arguments (without node/script path).
 * @returns The job id following `--job-id`, or null when the flag is absent.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagPosition = args.indexOf('--job-id');
  if (flagPosition === -1) return null;

  const jobId = args[flagPosition + 1];
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||||
|
||||
/**
 * Inserts a new 'match-search-results' background job that is already running.
 *
 * @param config Run configuration stored on the job row.
 * @returns The id of the created job.
 */
async function createNewJob(config: Record<string, unknown>): Promise<string> {
  const { id } = await prisma.backgroundJob.create({
    data: {
      type: 'match-search-results',
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });
  return id;
}
|
||||
|
||||
/**
 * Writes the current counters to a background job row.
 *
 * @param jobId     Job to update.
 * @param processed Stored as `processed`.
 * @param found     Stored as `succeeded`.
 * @param total     Stored as `totalItems`.
 */
async function updateJobProgress(jobId: string, processed: number, found: number, total: number): Promise<void> {
  const progress = { processed, succeeded: found, totalItems: total };
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: progress,
  });
}
|
||||
|
||||
/**
 * Reports whether the job row currently has status 'stopping'.
 *
 * @param jobId Job to look up.
 * @returns True only when the job exists and its status is 'stopping'.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const row = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  const status = row?.status;
  return status === 'stopping';
}
|
||||
|
||||
/**
 * Closes a background job, stamping `completedAt`.
 *
 * @param jobId Job to close.
 * @param error Optional failure message; when truthy the job becomes
 *              'failed', otherwise 'completed'. Stored on the row as `error`.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const status = error ? 'failed' : 'completed';
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status, error, completedAt: new Date() },
  });
}
|
||||
|
||||
// --- Types ---
|
||||
|
||||
/** Church fields this script works with. */
interface ChurchRecord {
  id: string;
  /** Church name; its significant words are looked for in candidate pages. */
  name: string;
  /** Street address, when known. */
  address: string | null;
  /** City, when known. */
  city: string | null;
  /** State/region, when known. */
  state: string | null;
  /** Country value; also used to filter stored search results. */
  country: string;
  latitude: number;
  longitude: number;
}
|
||||
|
||||
/**
 * Run counters for the matching pass.
 * NOTE(review): the code that updates these lies outside this excerpt; the
 * per-field notes below are read off the field names — confirm against usage.
 */
interface MatchStats {
  /** Churches handled so far. */
  processed: number;
  /** Churches for which a website was accepted. */
  matched: number;
  /** Churches with no candidate results. */
  noResults: number;
  /** Churches whose candidates all failed verification. */
  verifyFailed: number;
  /** Churches that raised an error. */
  errors: number;
  /** Presumably a `Date.now()` timestamp (ms) taken at start. */
  startTime: number;
}
|
||||
|
||||
// --- Helpers ---
|
||||
|
||||
// Cooperative stop flag, initially off.
// NOTE(review): presumably set by signal/stop handlers further down in this
// script — not visible in this excerpt.
let shuttingDown = false;
|
||||
|
||||
/** Prints `msg` to stdout, prefixed with the current ISO-8601 timestamp. */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log(`[${stamp}] ${msg}`);
}
|
||||
|
||||
/** Prints `msg` to stderr, prefixed with the current ISO-8601 timestamp. */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error(`[${stamp}] ${msg}`);
}
|
||||
|
||||
/**
 * Canonicalizes a name for comparison: lower-case, accents folded to their
 * base letters, everything except a-z, 0-9 and whitespace removed, whitespace
 * runs collapsed to one space, and the result trimmed.
 *
 * Accents are folded (NFD + removal of combining marks) before the ASCII
 * filter runs. Without that step accented letters were deleted outright
 * ("église" became "glise"), which mangled non-English names and disagreed
 * with the Wikidata enrichment script's version of this helper.
 *
 * @param str Raw text (church name, city, address...).
 * @returns The normalized text; may be empty.
 */
function normalizeForMatch(str: string): string {
  return str.toLowerCase()
    .normalize('NFD').replace(/[\u0300-\u036f]/g, '') // strip accents
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}
|
||||
|
||||
/** Words for "parish"/"church" in several languages; matched as substrings of lower-cased page text. */
const CATHOLIC_KEYWORDS = [
  // parish
  'parish',
  'church',
  'catholic',
  'parroquia',
  'paroisse',
  'pfarrei',
  'parafia',
  'paroquia',
  'parrocchia',
  'farnost',
  'plebania',
  'parochie',
  'župnija',
  'farnosť',
  // church
  'iglesia',
  'église',
  'kirche',
  'kościół',
  'chiesa',
  'kostel',
  'templom',
  'kerk',
];

/** Phrases announcing mass/service times; matched as substrings of lower-cased page text. */
const MASS_SCHEDULE_KEYWORDS = [
  // English
  'mass schedule',
  'mass times',
  'worship schedule',
  'worship times',
  'service times',
  'sunday mass',
  'weekday mass',
  // Spanish / French
  'horario de misas',
  'horarios de misa',
  'horaires des messes',
  // German
  'gottesdienst',
  'gottesdienstzeiten',
  'messzeiten',
  // Polish
  'msze święte',
  'godziny mszy',
  'msze św',
  // Italian
  'orari delle messe',
  'orario messe',
  // Portuguese
  'horário das missas',
];

/** Phrases typical of tourism/sightseeing pages; matched as substrings of lower-cased page text. */
const TOURISM_KEYWORDS = [
  'tourism',
  'turismo',
  'tourisme',
  'turisme',
  'touristik',
  'turistico',
  'attractions',
  'things to do',
  'sightseeing',
  'sehenswürdigkeiten',
  'what to see',
  'places to visit',
  'travel guide',
  'reiseführer',
  'patrimoine',
  'heritage trail',
  'cultural heritage',
  'punto de interés',
  'point of interest',
  'points of interest',
];
|
||||
|
||||
/**
 * Generic words that do not identify a particular church: articles and
 * prepositions, common saint/devotion names, and words for church, parish,
 * chapel and similar in several languages.
 *
 * Every entry is passed through `normalizeForMatch`, the same normalization
 * applied to the tokens it is compared against. Without that, entries with
 * accented letters ('église', 'kościół', 'basílica', 'kápolna') could never
 * match a normalized token. The set is built once instead of on every call.
 */
const SIGNIFICANT_WORD_STOP_WORDS: ReadonlySet<string> = new Set([
  'the', 'of', 'and', 'in', 'at', 'for', 'our', 'lady',
  'st', 'saint', 'saints', 'san', 'sant', 'santa', 'santo', 'sacred',
  'christ', 'jesus', 'mary', 'maria', 'king', 'lord', 'heart',
  'cross', 'lady', 'queen', 'angel', 'angels', 'good', 'star',
  'nome', 'pere', 'madre', 'notre', 'dame', 'bien',
  'onze', 'lieve', 'vrouw', 'heer',
  'rosa', 'paul', 'anne', 'jean', 'joan', 'luke', 'marc',
  'rita', 'jose', 'leon', 'pius', 'roch', 'yves', 'ines',
  'vita', 'fara', 'bona',
  'cristo', 'fatima', 'lourdes', 'perpetuo', 'socorro', 'calvario',
  'rosario', 'pilar', 'carmen', 'dolores', 'remedios', 'nieves',
  'grotte', 'mission', 'sagrada', 'sagrado', 'familia',
  'guadalupe', 'assumption', 'immaculate', 'perpetual', 'divine',
  'knights', 'columbus',
  'house', 'home', 'hall', 'center', 'centre', 'centro',
  'deacon', 'priest', 'bishop', 'father', 'sister', 'brother',
  'school', 'academy', 'college', 'seminary', 'rectory', 'retreat',
  'church', 'parish', 'catholic', 'roman', 'holy', 'chapel',
  'cathedral', 'basilica', 'shrine', 'convent', 'monastery',
  'chapelle', 'eglise', 'église', 'paroisse', 'couvent', 'grotte',
  'iglesia', 'parroquia', 'capilla', 'ermita', 'convento', 'basílica',
  'kirche', 'kapelle', 'pfarrei', 'kloster',
  'chiesa', 'parrocchia', 'cappella', 'oratorio',
  'igreja', 'capela', 'paroquia',
  'kościół', 'kaplica', 'parafia', 'droga',
  'kostel', 'kaple', 'farnost', 'templom', 'kápolna',
  'de', 'la', 'le', 'les', 'du', 'des', 'el', 'los', 'las',
  'di', 'del', 'della', 'delle', 'degli',
  'do', 'da', 'dos', 'das',
  'und', 'der', 'die', 'das', 'von',
  'nad', 'pod', 'przy',
].map((word) => normalizeForMatch(word)));

/**
 * Extracts the words of a church name that help tell it apart from others.
 *
 * The name is normalized, split on spaces, and words shorter than three
 * characters or present in the stop-word set are discarded.
 *
 * @param name Church name as stored.
 * @returns Distinguishing words in their original order (duplicates kept).
 */
function getSignificantWords(name: string): string[] {
  return normalizeForMatch(name)
    .split(' ')
    .filter(w => w.length >= 3 && !SIGNIFICANT_WORD_STOP_WORDS.has(w));
}
|
||||
|
||||
/**
 * Reduces an HTML document to lower-cased plain text for keyword matching.
 *
 * `<script>` and `<style>` elements are removed with their content, remaining
 * tags and named entities (`&name;`) become spaces, and whitespace runs are
 * collapsed to a single space. The result is not trimmed.
 */
function stripHtml(html: string): string {
  const steps: Array<[RegExp, string]> = [
    [/<script[^>]*>[\s\S]*?<\/script>/gi, ''],
    [/<style[^>]*>[\s\S]*?<\/style>/gi, ''],
    [/<[^>]+>/g, ' '],
    [/&[a-z]+;/gi, ' '],
    [/\s+/g, ' '],
  ];

  let text = html;
  for (const [pattern, replacement] of steps) {
    text = text.replace(pattern, replacement);
  }
  return text.toLowerCase();
}
|
||||
|
||||
// --- URL Verification (same logic as enrich-with-freesearch.ts) ---
|
||||
|
||||
/**
 * Downloads a candidate page and decides whether it plausibly belongs to the
 * given church.
 *
 * Signals taken from the lower-cased page text: how many significant name
 * words occur, whether the city occurs, whether at least two address words
 * occur, and whether Catholic, mass-schedule or tourism keywords occur.
 * Signals taken from the URL: whether the hostname contains a name word of
 * 4+ characters, and whether the path has more than two segments.
 *
 * Rejected outright: tourism pages without a mass schedule, and deep URLs
 * with neither a matching hostname nor a mass schedule. Every acceptance
 * rule needs at least one name word on the page.
 *
 * @param url    Candidate website.
 * @param church Church the URL is checked against.
 * @returns True when the page passes; false otherwise, including when the
 *          request fails or the body is not a string.
 */
async function verifyUrl(url: string, church: ChurchRecord): Promise<boolean> {
  try {
    const response = await axios.get(url, {
      timeout: 10000,
      maxRedirects: 3,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; NearestMass/1.0; +https://nearestmass.com)',
        'Accept': 'text/html',
      },
      maxContentLength: 200000,
      responseType: 'text',
    });

    if (typeof response.data !== 'string') return false;

    const text = stripHtml(response.data);
    const pageHas = (needle: string): boolean => text.includes(needle);

    // --- Page-content signals ---
    const nameWords = getSignificantWords(church.name);
    const nameMatches = nameWords.filter(w => pageHas(w)).length;

    const cityNorm = church.city ? normalizeForMatch(church.city) : '';
    const cityMatch = cityNorm.length > 2 && pageHas(cityNorm);

    // Address words: 4+ characters and not purely numeric; two hits required.
    const addrWords = church.address
      ? normalizeForMatch(church.address).split(' ').filter(w => w.length >= 4 && !/^\d+$/.test(w))
      : [];
    const addressMatch = addrWords.filter(w => pageHas(w)).length >= 2;

    const hasCatholicKeyword = CATHOLIC_KEYWORDS.some(kw => pageHas(kw));
    const hasMassSchedule = MASS_SCHEDULE_KEYWORDS.some(kw => pageHas(kw));
    const isTourismPage = TOURISM_KEYWORDS.some(kw => pageHas(kw));

    // --- URL signals (both stay false when the URL cannot be parsed) ---
    let domainMatchesName = false;
    let isDeepUrl = false;
    try {
      const parsed = new URL(url);
      const hostname = parsed.hostname.toLowerCase();
      domainMatchesName = nameWords.some(w => w.length >= 4 && hostname.includes(w));
      isDeepUrl = parsed.pathname.split('/').filter(Boolean).length > 2;
    } catch { /* ignore */ }

    // --- Rejections ---
    if (isTourismPage && !hasMassSchedule) return false;
    if (isDeepUrl && !domainMatchesName && !hasMassSchedule) return false;

    // --- Acceptance: every rule below needs at least one name word ---
    if (nameMatches < 1) return false;
    if (hasMassSchedule) return true;
    if (domainMatchesName && hasCatholicKeyword) return true;
    if (addressMatch) return true;

    // Without a city to cross-check, more name words are required.
    const hasCity = !!(church.city && church.city.trim());
    return hasCity ? (nameMatches >= 2 || cityMatch) : nameMatches >= 3;
  } catch {
    return false;
  }
}
|
||||
|
||||
// --- ChromaDB Querying ---
|
||||
|
||||
/**
 * A stored search result returned by a ChromaDB query, flattened into a
 * single record (entry id + query distance + the entry's metadata fields).
 */
interface ChromaResult {
  /** ChromaDB entry id. */
  id: string;
  /** Embedding distance between the query and this entry (smaller = closer). */
  distance: number;

  /** URL of the search result (`resultUrl` metadata). */
  url: string;
  /** Title of the search result (`resultTitle` metadata). */
  title: string;
  /** Score stored with the search result (`score` metadata). */
  score: number;

  /** Id of the church the search result was stored for. */
  churchId: string;
  /** Name of the church the search result was stored for. */
  churchName: string;
  /** City of the church the search result was stored for. */
  churchCity: string;

  /** True once the URL has been verified for a church (`verified` metadata). */
  verified?: boolean;
}
|
||||
|
||||
/**
 * Queries ChromaDB for stored search results that are semantically close to
 * a church's identity (name, address, city, country).
 *
 * The query is restricted to entries whose `churchCountry` metadata equals the
 * church's country. Entries are dropped when their distance exceeds
 * `threshold` or when they carry no URL.
 *
 * @param church     Church to search for.
 * @param collection ChromaDB collection holding stored search results.
 * @param threshold  Maximum embedding distance (inclusive) to keep a result.
 * @param nResults   Number of nearest entries to request from ChromaDB.
 * @returns Matching results in the order ChromaDB returned them.
 */
async function findCandidatesForChurch(
  church: ChurchRecord,
  collection: Collection,
  threshold: number,
  nResults: number
): Promise<ChromaResult[]> {
  // Build identity text for semantic search
  const identityText = `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim();
  const queryEmbedding = await embedSingle(identityText);

  const results = await collection.query({
    queryEmbeddings: [queryEmbedding],
    nResults,
    where: { churchCountry: church.country },
  });

  // Only one embedding was sent, so only the first row of each result array matters.
  const ids = results.ids[0];
  if (!ids) return [];

  const metadatas = results.metadatas?.[0] ?? [];
  const distances = results.distances?.[0] ?? [];

  return ids
    .map((id, i): ChromaResult => {
      // An entry stored without metadata comes back as null; treat it as empty
      // instead of dereferencing null (it is then dropped for lacking a URL).
      const metadata = (metadatas[i] ?? {}) as Record<string, unknown>;
      return {
        id,
        url: (metadata.resultUrl as string) || '',
        title: (metadata.resultTitle as string) || '',
        score: (metadata.score as number) || 0,
        // A missing distance is treated as 1.
        distance: distances[i] ?? 1,
        churchId: (metadata.churchId as string) || '',
        churchName: (metadata.churchName as string) || '',
        churchCity: (metadata.churchCity as string) || '',
        verified: (metadata.verified as boolean) || false,
      };
    })
    .filter((r) => r.distance <= threshold && Boolean(r.url));
}
|
||||
|
||||
/**
 * Collapses results that share a URL, keeping the entry with the smallest
 * distance for each URL (the earliest one wins on a tie), and returns the
 * survivors ordered from closest to farthest.
 */
function deduplicateByUrl(results: ChromaResult[]): ChromaResult[] {
  const closestByUrl = results.reduce((best, candidate) => {
    const current = best.get(candidate.url);
    if (current === undefined || candidate.distance < current.distance) {
      best.set(candidate.url, candidate);
    }
    return best;
  }, new Map<string, ChromaResult>());

  const unique = Array.from(closestByUrl.values());
  unique.sort((left, right) => left.distance - right.distance);
  return unique;
}
|
||||
|
||||
// --- Main Processing ---
|
||||
|
||||
/**
 * Attempts to attach a verified website to a single church using search
 * results already stored in ChromaDB.
 *
 * Steps: query ChromaDB for semantically similar stored results, order the
 * candidate URLs by priority, verify at most five of them with `verifyUrl`,
 * and (unless `dryRun`) persist the first URL that verifies to the database
 * and flag the matching ChromaDB entry as verified.
 *
 * Never throws: any failure is counted in `stats.errors` and logged.
 *
 * @param church     Church to find a website for.
 * @param collection ChromaDB collection holding stored search results.
 * @param stats      Run counters; mutated in place.
 * @param threshold  Maximum embedding distance for a stored result to count.
 * @param dryRun     When true, nothing is written to the database or ChromaDB.
 */
async function processChurch(
  church: ChurchRecord,
  collection: Collection,
  stats: MatchStats,
  threshold: number,
  dryRun: boolean
): Promise<void> {
  const label = `${church.name} (${church.city || 'unknown'}, ${church.country})`;

  try {
    // 1. Semantic search for similar results in ChromaDB
    const candidates = await findCandidatesForChurch(church, collection, threshold, 20);

    if (candidates.length === 0) {
      log(` - ${label} => no ChromaDB results within threshold`);
      stats.noResults++;
      return;
    }

    // 2. Separate results: own church vs cross-church
    const ownResults = candidates.filter((r) => r.churchId === church.id);
    const crossResults = candidates.filter((r) => r.churchId !== church.id);

    // 3. URL frequency: URLs appearing for multiple churches are likely real
    //    parish/diocese sites. Count DISTINCT churches per URL so that several
    //    stored rows for the same church do not inflate the frequency.
    const churchesByUrl = new Map<string, Set<string>>();
    for (const r of candidates) {
      const owners = churchesByUrl.get(r.url) ?? new Set<string>();
      owners.add(r.churchId);
      churchesByUrl.set(r.url, owners);
    }

    // 4. Prioritize: already-verified URLs from other churches, then high-frequency URLs,
    //    then own-church results, then same-city cross-church results.
    const verifiedFromOthers = crossResults.filter((r) => r.verified);
    const highFreqUrls = [...churchesByUrl.entries()]
      .filter(([, owners]) => owners.size >= 2)
      .map(([url]) => url);

    // A Set keeps first-insertion order, which is exactly the priority order.
    const urlsToTry = new Set<string>();

    // Verified URLs from other churches (highest priority)
    for (const r of verifiedFromOthers) urlsToTry.add(r.url);

    // High-frequency URLs (appear in results for multiple churches)
    for (const url of highFreqUrls) urlsToTry.add(url);

    // Own church results by distance (closest semantic match first)
    for (const r of deduplicateByUrl(ownResults)) urlsToTry.add(r.url);

    // Cross-church results from same city
    const sameCityCross = crossResults.filter(
      (r) =>
        church.city &&
        r.churchCity &&
        normalizeForMatch(r.churchCity) === normalizeForMatch(church.city)
    );
    for (const r of deduplicateByUrl(sameCityCross)) urlsToTry.add(r.url);

    // Limit to top 5 candidates
    const topUrls = [...urlsToTry].slice(0, 5);

    log(` ? ${label} => ${candidates.length} results, trying ${topUrls.length} candidates`);

    // 5. Verify each candidate; stop at the first one that passes
    let verifiedUrl: string | null = null;
    for (const url of topUrls) {
      if (await verifyUrl(url, church)) {
        verifiedUrl = url;
        break;
      }
      stats.verifyFailed++;
    }

    if (!verifiedUrl) {
      log(` ~ ${label} => ${topUrls.length} candidates failed verification`);
      stats.noResults++;
      return;
    }

    log(` + ${label} => ${verifiedUrl}`);
    stats.matched++;
    if (dryRun) return;

    await prisma.church.update({
      where: { id: church.id },
      data: {
        website: verifiedUrl,
        hasWebsite: true,
      },
    });

    // Mark in ChromaDB (update replaces metadata, so include all fields).
    // Best-effort: the database row is already updated, so a ChromaDB failure
    // is logged rather than counted as an error for this church.
    try {
      const matchingResult = candidates.find((r) => r.url === verifiedUrl);
      if (matchingResult) {
        await collection.update({
          ids: [matchingResult.id],
          metadatas: [{
            churchId: matchingResult.churchId,
            churchName: matchingResult.churchName,
            churchCity: matchingResult.churchCity,
            churchCountry: church.country,
            searchQuery: '',
            resultUrl: verifiedUrl,
            resultTitle: matchingResult.title || '',
            score: matchingResult.score || 0,
            verified: true,
          }],
        });
      }
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      logError(` ! ${label} => could not mark result as verified in ChromaDB: ${reason}`);
    }
  } catch (error: unknown) {
    stats.errors++;
    const reason = error instanceof Error ? error.message : String(error);
    logError(` ! ${label} => error: ${reason}`);
  }
}
|
||||
|
||||
// --- Main ---
|
||||
|
||||
/**
 * Entry point for the second-pass matcher.
 *
 * CLI flags: `--country <code>`, `--limit <n>` (default 500),
 * `--threshold <distance>` (default 0.4), `--dry-run`.
 *
 * Connects to the ChromaDB search-results collection, selects OSM churches
 * that have no website but have already been FreeSearch'd, runs
 * `processChurch` on each, reports job progress every 10 churches, and prints
 * a summary at the end.
 */
async function main() {
  const args = process.argv.slice(2);
  const countryIndex = args.indexOf('--country');
  const limitIndex = args.indexOf('--limit');
  const thresholdIndex = args.indexOf('--threshold');
  const dryRun = args.includes('--dry-run');

  const countryCode = countryIndex !== -1 ? args[countryIndex + 1] : undefined;
  const limit = limitIndex !== -1 ? Number.parseInt(args[limitIndex + 1], 10) : 500;
  const threshold = thresholdIndex !== -1 ? Number.parseFloat(args[thresholdIndex + 1]) : 0.4;

  // Reject unusable numbers up front: a NaN limit would reach Prisma's `take`,
  // and a NaN threshold would silently filter out every ChromaDB result.
  if (!Number.isInteger(limit) || limit < 0) {
    logError(`Invalid --limit value: ${args[limitIndex + 1]}`);
    process.exit(1);
  }
  if (!Number.isFinite(threshold) || threshold < 0) {
    logError(`Invalid --threshold value: ${args[thresholdIndex + 1]}`);
    process.exit(1);
  }

  // Graceful shutdown: finish the current church, then stop.
  process.on('SIGTERM', () => { log('Received SIGTERM'); shuttingDown = true; });
  process.on('SIGINT', () => { log('Received SIGINT'); shuttingDown = true; });

  log('============================================================');
  log('Second-Pass Search Result Matching');
  log('============================================================');
  log(`Country: ${countryCode || 'All'}`);
  log(`Limit: ${limit}`);
  log(`Threshold: ${threshold}`);
  log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  log('============================================================');

  // Connect to ChromaDB
  let collection: Collection;
  try {
    collection = await getCollection(COLLECTION_NAMES.SEARCH_RESULTS);
    log('ChromaDB search_results collection connected');
  } catch (e: unknown) {
    logError(`ChromaDB unavailable: ${e instanceof Error ? e.message : String(e)}`);
    logError('This script requires ChromaDB. Make sure it is running.');
    process.exit(1);
  }

  // Check collection has data
  const count = await collection.count();
  log(`ChromaDB search_results: ${count} entries`);
  if (count === 0) {
    log('No search results stored yet. Run enrich-with-freesearch.ts first.');
    process.exit(0);
  }

  // Job tracking: resume the job named on the command line, else create one
  let jobId = await createOrResumeJob(args);
  if (!jobId) {
    jobId = await createNewJob({ countryCode, limit, threshold, dryRun });
  }
  log(`Job ID: ${jobId}`);

  // Get churches without websites that have been FreeSearch'd
  const whereClause: Record<string, unknown> = {
    source: 'osm',
    website: null,
    freeSearchedAt: { not: null },
  };
  if (countryCode) {
    whereClause.country = countryCode;
  }

  const churches = await prisma.church.findMany({
    where: whereClause as any,
    select: {
      id: true, name: true, address: true, city: true, state: true,
      country: true, latitude: true, longitude: true,
    },
    take: limit,
    orderBy: { updatedAt: 'asc' },
  });

  log(`Found ${churches.length} churches without websites (already FreeSearch'd)`);

  const stats: MatchStats = {
    processed: 0,
    matched: 0,
    noResults: 0,
    verifyFailed: 0,
    errors: 0,
    startTime: Date.now(),
  };

  for (const church of churches) {
    if (shuttingDown) break;
    stats.processed++;

    await processChurch(church, collection, stats, threshold, dryRun);

    // Job tracking every 10 items
    if (jobId && stats.processed % 10 === 0) {
      await updateJobProgress(jobId, stats.processed, stats.matched, churches.length);
      const stopping = await checkJobStopping(jobId);
      if (stopping) {
        log('Job stop requested via admin dashboard.');
        shuttingDown = true;
        break;
      }
    }

    // Progress logging every 50 items
    if (stats.processed % 50 === 0) {
      const elapsedSeconds = (Date.now() - stats.startTime) / 1000;
      // Guard the division so an instantaneous run cannot report Infinity.
      const rate = elapsedSeconds > 0 ? Math.round((stats.processed / elapsedSeconds) * 3600) : 0;
      log(`Progress: ${stats.processed}/${churches.length} processed, ${stats.matched} matched, ${stats.noResults} no match, ${stats.errors} errors (~${rate}/hour)`);
    }
  }

  // Complete job
  if (jobId) {
    await updateJobProgress(jobId, stats.processed, stats.matched, churches.length);
    await completeJob(jobId);
  }

  // Print summary
  const elapsed = ((Date.now() - stats.startTime) / 1000).toFixed(1);
  const matchRate = stats.processed > 0
    ? ((stats.matched / stats.processed) * 100).toFixed(1)
    : '0.0';

  log('');
  log('============================================================');
  log('Second-Pass Matching Summary');
  log('============================================================');
  log(`Churches processed: ${stats.processed}`);
  log(`Websites matched: ${stats.matched}`);
  log(`No match found: ${stats.noResults}`);
  log(`Verify rejected: ${stats.verifyFailed}`);
  log(`Errors: ${stats.errors}`);
  log(`Match rate: ${matchRate}%`);
  log(`Elapsed: ${elapsed}s`);
  log('============================================================');

  await prisma.$disconnect();
  await pool.end();
}
|
||||
|
||||
// Run the script; any unhandled failure is logged and exits with a non-zero code.
main().catch((error: unknown) => {
  // A rejection is not guaranteed to be an Error, so do not assume `.message`.
  const reason = error instanceof Error ? error.message : String(error);
  logError(`Fatal error: ${reason}`);
  process.exit(1);
});
|
||||
110
scripts/normalize-country-codes.ts
Normal file
110
scripts/normalize-country-codes.ts
Normal file
@@ -0,0 +1,110 @@
|
||||
/**
|
||||
* Normalize country codes in the database.
|
||||
* Converts full country names to ISO 3166-1 alpha-2 codes.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/normalize-country-codes.ts --dry-run
|
||||
* npx tsx scripts/normalize-country-codes.ts --execute
|
||||
*/
|
||||
|
||||
import path from 'path';
|
||||
import dotenv from 'dotenv';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { normalizeCountryCode } from '../src/lib/country-normalize';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Reports, and with `--execute` applies, the normalization of every distinct
 * `country` value stored on churches.
 *
 * Each value is passed through `normalizeCountryCode`. Values that change are
 * listed as pending updates; unchanged values are reported either as already
 * normalized (two characters, upper-case) or as unknown.
 */
async function main() {
  const dryRun = process.argv.indexOf('--execute') === -1;
  if (dryRun) {
    console.log('DRY RUN — no changes will be made. Use --execute to apply.\n');
  }

  // Get all distinct country values
  const distinctRows = await prisma.church.findMany({
    where: { country: { not: null } },
    distinct: ['country'],
    select: { country: true },
  });

  const countryValues: string[] = [];
  for (const row of distinctRows) {
    if (row.country !== null) countryValues.push(row.country);
  }

  console.log(`Found ${countryValues.length} distinct country values.\n`);

  // Sort every value into one of three buckets
  const changes: { original: string; normalized: string; count?: number }[] = [];
  const alreadyNormalized: string[] = [];
  const unknown: string[] = [];

  for (const value of countryValues) {
    const normalized = normalizeCountryCode(value);
    if (normalized !== value) {
      changes.push({ original: value, normalized });
      continue;
    }
    // Unchanged: either already a code, or something the normalizer left alone
    const looksLikeIsoCode = value.length === 2 && value === value.toUpperCase();
    (looksLikeIsoCode ? alreadyNormalized : unknown).push(value);
  }

  // Attach the number of affected churches to each pending change
  for (const change of changes) {
    change.count = await prisma.church.count({
      where: { country: change.original },
    });
  }

  // Report
  console.log(`Already normalized (${alreadyNormalized.length}): ${alreadyNormalized.sort().join(', ')}\n`);

  if (changes.length === 0) {
    console.log('No changes needed — all country values are already normalized.\n');
  } else {
    console.log(`Changes to apply (${changes.length}):`);
    changes.forEach(({ original, normalized, count }) => {
      console.log(` "${original}" → "${normalized}" (${count} churches)`);
    });
    console.log();
  }

  if (unknown.length > 0) {
    console.log(`Unknown values (${unknown.length}): ${unknown.join(', ')}`);
    console.log(' These could not be mapped to ISO codes. Review manually.\n');
  }

  // Apply changes
  if (!dryRun && changes.length > 0) {
    let totalUpdated = 0;
    for (const { original, normalized } of changes) {
      const { count: updated } = await prisma.church.updateMany({
        where: { country: original },
        data: { country: normalized },
      });
      totalUpdated += updated;
      console.log(`Updated "${original}" → "${normalized}": ${updated} churches`);
    }
    console.log(`\nTotal updated: ${totalUpdated} churches`);
  }

  await prisma.$disconnect();
  await pool.end();
}

// Run the script; log any failure and exit with a non-zero code.
main().catch((failure) => {
  console.error('Error:', failure);
  process.exit(1);
});
|
||||
197
scripts/populate-chromadb.ts
Normal file
197
scripts/populate-chromadb.ts
Normal file
@@ -0,0 +1,197 @@
|
||||
/**
|
||||
* Bulk-populate ChromaDB collections from the database.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/populate-chromadb.ts --collection church_identity
|
||||
* npx tsx scripts/populate-chromadb.ts --collection page_classification
|
||||
* npx tsx scripts/populate-chromadb.ts --all
|
||||
* npx tsx scripts/populate-chromadb.ts --all --batch-size 50 --limit 1000
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { getCollection, COLLECTION_NAMES, CollectionName } from '../src/chromadb/collections';
|
||||
import { embed } from '../src/chromadb/embeddings';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
const args = process.argv.slice(2);

/** Returns the token that follows `flag` on the command line, or null if absent. */
function readFlagValue(flag: string): string | null {
  const index = args.indexOf(flag);
  if (index === -1) return null;
  return args[index + 1] ?? null;
}

/**
 * Reads an integer flag, returning `fallback` when the flag is not given.
 * Throws on a missing, non-numeric or too-small value so a bad number cannot
 * reach Prisma's `take` or silently disable the limit.
 */
function readIntFlag(flag: string, fallback: number, min: number): number {
  if (!args.includes(flag)) return fallback;
  const raw = readFlagValue(flag);
  const parsed = raw === null ? Number.NaN : Number.parseInt(raw, 10);
  if (!Number.isInteger(parsed) || parsed < min) {
    throw new Error(`Invalid value for ${flag}: ${raw ?? '(missing)'} (expected an integer >= ${min})`);
  }
  return parsed;
}

// --collection <name>: populate a single collection
const collectionArg = readFlagValue('--collection');
// --all: populate every collection that has a populate function
const populateAll = args.includes('--all');
// --batch-size <n>: rows fetched and embedded per round trip (default 100)
const batchSize = readIntFlag('--batch-size', 100, 1);
// --limit <n>: maximum rows to process per collection; 0 means no limit
const limit = readIntFlag('--limit', 0, 0);
|
||||
|
||||
/**
 * Indexes churches into the `church_identity` ChromaDB collection.
 *
 * Churches are read in id order with cursor pagination, `batchSize` at a time
 * and at most `limit` in total (when `limit` > 0). Each church is embedded as
 * "name address city country" and upserted under the id `church-<id>`.
 */
async function populateChurchIdentity() {
  console.log('\n=== Populating church_identity ===');
  const collection = await getCollection(COLLECTION_NAMES.CHURCH_IDENTITY);

  const totalCount = await prisma.church.count();
  const maxItems = limit > 0 ? Math.min(limit, totalCount) : totalCount;
  console.log(`Total churches: ${totalCount}, processing: ${maxItems}`);

  let processed = 0;
  let lastSeenId: string | undefined;

  while (processed < maxItems) {
    const pageSize = Math.min(batchSize, maxItems - processed);
    // Resume just after the last row of the previous page, if there was one.
    const pagination = lastSeenId ? { skip: 1, cursor: { id: lastSeenId } } : {};

    const page = await prisma.church.findMany({
      take: pageSize,
      ...pagination,
      orderBy: { id: 'asc' },
      select: {
        id: true,
        name: true,
        address: true,
        city: true,
        country: true,
        source: true,
        latitude: true,
        longitude: true,
      },
    });

    if (page.length === 0) break;

    // NOTE(review): a null country would be interpolated as the text "null" here — confirm the column is always set.
    const documents = page.map((church) =>
      `${church.name} ${church.address || ''} ${church.city || ''} ${church.country}`.trim()
    );
    const embeddings = await embed(documents);

    const ids = page.map((church) => `church-${church.id}`);
    const metadatas = page.map((church) => ({
      churchId: church.id,
      country: church.country,
      source: church.source,
      lat: church.latitude,
      lng: church.longitude,
    }));

    await collection.upsert({ ids, embeddings, documents, metadatas });

    processed += page.length;
    lastSeenId = page[page.length - 1].id;
    console.log(` Processed ${processed}/${maxItems}`);
  }

  console.log(` Done: ${processed} churches indexed`);
}
|
||||
|
||||
/**
 * Indexes pages into the `page_classification` ChromaDB collection.
 *
 * Scans churches that have been scraped and have at least one active mass
 * schedule, in id order, `batchSize` at a time and at most `limit` in total
 * (when `limit` > 0). For every church with stored raw HTML, the first 2000
 * characters of that HTML are embedded and upserted under `page-<id>`;
 * churches without raw HTML are scanned but not indexed.
 */
async function populatePageClassification() {
  console.log('\n=== Populating page_classification ===');
  const collection = await getCollection(COLLECTION_NAMES.PAGE_CLASSIFICATION);

  // Index churches that have been successfully scraped (have mass schedules)
  const eligible = {
    lastScrapedAt: { not: null },
    massSchedules: { some: { isActive: true } },
  };

  const totalCount = await prisma.church.count({ where: eligible });
  const maxItems = limit > 0 ? Math.min(limit, totalCount) : totalCount;
  console.log(`Scraped churches with schedules: ${totalCount}, processing: ${maxItems}`);

  let processed = 0; // churches scanned
  let indexed = 0; // churches actually written to ChromaDB
  let cursor: string | undefined = undefined;

  while (processed < maxItems) {
    const currentBatch = Math.min(batchSize, maxItems - processed);
    const churches = await prisma.church.findMany({
      take: currentBatch,
      ...(cursor ? { skip: 1, cursor: { id: cursor } } : {}),
      where: eligible,
      orderBy: { id: 'asc' },
      select: {
        id: true,
        massScheduleUrl: true,
        website: true,
        websiteLanguage: true,
        scraperConfig: { select: { rawHtml: true } },
      },
    });

    if (churches.length === 0) break;

    // Use stored raw HTML (truncated) as the document
    const validChurches = churches.filter((c) => c.scraperConfig?.rawHtml);
    if (validChurches.length > 0) {
      const documents = validChurches.map(
        (c) => (c.scraperConfig?.rawHtml || '').slice(0, 2000)
      );

      const embeddings = await embed(documents);

      await collection.upsert({
        ids: validChurches.map((c) => `page-${c.id}`),
        embeddings,
        documents,
        metadatas: validChurches.map((c) => ({
          url: c.massScheduleUrl || c.website || '',
          isMassSchedulePage: true,
          language: c.websiteLanguage || 'unknown',
        })),
      });
      indexed += validChurches.length;
    }

    processed += churches.length;
    cursor = churches[churches.length - 1].id;
    console.log(` Processed ${processed}/${maxItems} (${validChurches.length} had raw HTML)`);
  }

  // Report what was written, not merely scanned: churches without raw HTML
  // are skipped and must not be counted as classified pages.
  console.log(` Done: ${indexed} pages classified (${processed} churches scanned)`);
}
|
||||
|
||||
/**
 * Entry point: populates the collection named by `--collection`, or every
 * supported collection with `--all`. Prints usage when neither is given.
 *
 * Database connections are always released in `finally`. Failures are
 * signalled through `process.exitCode` instead of `process.exit()`, because
 * `process.exit()` terminates immediately and would skip that cleanup.
 */
async function main() {
  try {
    if (!populateAll && !collectionArg) {
      console.log('Usage:');
      console.log(' npx tsx scripts/populate-chromadb.ts --collection church_identity');
      console.log(' npx tsx scripts/populate-chromadb.ts --collection page_classification');
      console.log(' npx tsx scripts/populate-chromadb.ts --all');
      console.log(' npx tsx scripts/populate-chromadb.ts --all --batch-size 50 --limit 1000');
      return;
    }

    const collectionsToPopulate: CollectionName[] = populateAll
      ? [COLLECTION_NAMES.CHURCH_IDENTITY, COLLECTION_NAMES.PAGE_CLASSIFICATION]
      : [collectionArg as CollectionName];

    for (const name of collectionsToPopulate) {
      switch (name) {
        case COLLECTION_NAMES.CHURCH_IDENTITY:
          await populateChurchIdentity();
          break;
        case COLLECTION_NAMES.PAGE_CLASSIFICATION:
          await populatePageClassification();
          break;
        default:
          // --collection is not validated up front, so unknown names land here.
          console.log(`Collection '${name}' does not have a populate function yet.`);
          console.log('Available: church_identity, page_classification');
      }
    }

    console.log('\nPopulation complete!');
  } catch (error) {
    console.error('Error:', error);
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}

main();
|
||||
54
scripts/populate-city-normalized.ts
Normal file
54
scripts/populate-city-normalized.ts
Normal file
@@ -0,0 +1,54 @@
|
||||
import { config } from 'dotenv';
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
// Load environment variables
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
// Create connection pool
|
||||
const connectionString = process.env.DATABASE_URL || '';
|
||||
const pool = new Pool({ connectionString });
|
||||
|
||||
// Create Prisma adapter
|
||||
const adapter = new PrismaPg(pool);
|
||||
|
||||
// Create Prisma client with adapter
|
||||
const prisma = new PrismaClient({
|
||||
adapter,
|
||||
log: ['error'],
|
||||
});
|
||||
|
||||
/**
 * Fills `city_normalized` for every church that has a city, using a single
 * SQL UPDATE: strip every character other than ASCII letters, digits and
 * spaces, trim, then lower-case.
 */
async function main() {
  console.log('Populating cityNormalized field using SQL...');

  // One raw UPDATE is much faster than updating rows one by one through Prisma.
  // NOTE(review): the character class is ASCII-only, so accented letters are
  // removed rather than transliterated — confirm this matches how the
  // application normalizes city names when it queries this column.
  const updatedRows = await prisma.$executeRaw`
    UPDATE churches
    SET city_normalized = LOWER(
      TRIM(
        REGEXP_REPLACE(
          COALESCE(city, ''),
          '[^a-zA-Z0-9 ]',
          '',
          'g'
        )
      )
    )
    WHERE city IS NOT NULL
  `;

  console.log(`✅ Updated ${updatedRows} churches with normalized cities`);
}
|
||||
|
||||
// Run the script. Cleanup lives in one place and also closes the pg pool,
// matching the other scripts that pair `prisma.$disconnect()` with
// `pool.end()`. The failure exit code is set without calling `process.exit()`
// so the cleanup below still runs.
main()
  .catch((e) => {
    console.error(e);
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });
|
||||
161
scripts/save-schedules-to-db.ts
Normal file
161
scripts/save-schedules-to-db.ts
Normal file
@@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Save mass schedules to database using scrapeChurch() service
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config({ path: '.env.local' });
|
||||
config({ path: '.env' });
|
||||
|
||||
import { scrapeChurch } from '../src/lib/scraper-service';
|
||||
import { prisma } from '../src/lib/db';
|
||||
|
||||
const PRIORITY_COUNTRIES = ['FR', 'DE', 'ES', 'PL', 'BR'];
|
||||
const CHURCHES_PER_COUNTRY = 5; // Start small to verify it works
|
||||
|
||||
/** Outcome of scraping a single church, recorded for the run summary. */
interface ScrapeResult {
  /** Id of the church that was scraped. */
  churchId: string;
  /** Name of the church that was scraped. */
  churchName: string;
  /** Country the church was selected under. */
  country: string;

  /** Whether the scrape succeeded. */
  success: boolean;
  /** Number of schedules created (0 when the scrape failed). */
  schedulesCreated: number;
  /** Failure reason; only set when `success` is false. */
  error?: string;
}
|
||||
|
||||
/**
 * Scrapes a small batch of not-yet-scraped OSM churches (with a website) for
 * each priority country through `scrapeChurch()`, which saves schedules to the
 * database, then prints a summary, database totals and a sample of saved
 * schedules.
 *
 * The Prisma connection is released even if the run fails part-way.
 */
async function saveSchedulesToDb() {
  console.log('Starting database save operation...\n');
  console.log(`Target: ${CHURCHES_PER_COUNTRY} churches per country`);
  console.log(`Countries: ${PRIORITY_COUNTRIES.join(', ')}\n`);

  const results: ScrapeResult[] = [];

  try {
    for (const country of PRIORITY_COUNTRIES) {
      console.log(`\n${'='.repeat(60)}`);
      console.log(`${country} - Finding churches to scrape...`);
      console.log('='.repeat(60));

      // Get churches with websites that haven't been scraped yet
      const churches = await prisma.church.findMany({
        where: {
          country,
          website: { not: null },
          source: 'osm',
          lastScrapedAt: null, // Only unscraped churches
        },
        take: CHURCHES_PER_COUNTRY,
        orderBy: { createdAt: 'asc' },
      });

      console.log(`Found ${churches.length} churches to scrape\n`);

      for (const [i, church] of churches.entries()) {
        process.stdout.write(`[${i + 1}/${churches.length}] ${church.name.substring(0, 40).padEnd(40)} `);

        const base = { churchId: church.id, churchName: church.name, country };
        let outcome: ScrapeResult;

        try {
          // Use the scrapeChurch service which saves to database
          const result = await scrapeChurch(church.id);

          if (result.success) {
            process.stdout.write(`✅ ${result.schedulesCreated} schedules saved\n`);
            outcome = { ...base, success: true, schedulesCreated: result.schedulesCreated };
          } else {
            process.stdout.write(`❌ ${result.error}\n`);
            outcome = { ...base, success: false, schedulesCreated: 0, error: result.error };
          }
        } catch (err: unknown) {
          // A thrown scrape counts as a failure for this church only.
          const message = err instanceof Error ? err.message : String(err);
          process.stdout.write(`❌ ERROR: ${message}\n`);
          outcome = { ...base, success: false, schedulesCreated: 0, error: message };
        }

        results.push(outcome);
      }
    }

    // Totals are derived from the per-church outcomes
    const totalChurches = results.length;
    const totalSuccess = results.filter((r) => r.success).length;
    const totalSchedules = results.reduce((sum, r) => sum + r.schedulesCreated, 0);
    // Avoid printing "NaN%" when no eligible church was found.
    const successRate = totalChurches > 0
      ? ((totalSuccess / totalChurches) * 100).toFixed(1)
      : '0.0';

    // Final summary
    console.log('\n\n');
    console.log('═'.repeat(80));
    console.log('DATABASE SAVE SUMMARY');
    console.log('═'.repeat(80));
    console.log('');
    console.log(`Total churches processed: ${totalChurches}`);
    console.log(`Successful scrapes: ${totalSuccess} (${successRate}%)`);
    console.log(`Total schedules saved to database: ${totalSchedules}`);
    console.log('');

    // Verify database records
    console.log('Verifying database records...\n');

    const dbScheduleCount = await prisma.massSchedule.count();
    const dbChurchesWithSchedules = await prisma.church.count({
      where: {
        massSchedules: {
          some: {},
        },
      },
    });

    console.log(`✓ Total mass schedules in database: ${dbScheduleCount}`);
    console.log(`✓ Churches with schedules: ${dbChurchesWithSchedules}`);
    console.log('');

    // Show sample of saved schedules
    console.log('Sample of saved schedules:\n');

    const sampleChurches = await prisma.church.findMany({
      where: {
        massSchedules: {
          some: {},
        },
      },
      include: {
        massSchedules: {
          take: 3,
          orderBy: { dayOfWeek: 'asc' },
        },
      },
      take: 3,
    });

    // Indexed by `dayOfWeek`
    const dayNames = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];

    for (const church of sampleChurches) {
      console.log(`${church.name} (${church.country}):`);
      for (const schedule of church.massSchedules) {
        console.log(` ${dayNames[schedule.dayOfWeek]} ${schedule.time} - ${schedule.language} ${schedule.massType || ''}`);
      }
      console.log('');
    }
  } finally {
    await prisma.$disconnect();
  }
}
|
||||
|
||||
// Run the script; a failure is logged and reported through a non-zero exit code.
saveSchedulesToDb().catch((error) => {
  console.error(error);
  process.exit(1);
});
|
||||
@@ -59,6 +59,13 @@ const PIPELINE_GROUPS: PipelineGroup[] = [
|
||||
{ name: 'masstimes-api-import', type: 'masstimes-api-import', config: {} },
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'geocode-enrichment',
|
||||
mode: 'sequential',
|
||||
phases: [
|
||||
{ name: 'forward-geocode', type: 'forward-geocode-enrichment', config: { limit: 500 } },
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'scrapers-batch-1',
|
||||
mode: 'parallel',
|
||||
@@ -138,6 +145,12 @@ function getJobCommand(type: string, language?: string | null, config?: Record<s
|
||||
if (config?.country) args.push('--country', String(config.country));
|
||||
return { command: 'npx', args };
|
||||
}
|
||||
case 'forward-geocode-enrichment': {
|
||||
const args = ['tsx', 'scripts/enrich-with-forward-geocode.ts'];
|
||||
if (limit) args.push('--limit', String(limit));
|
||||
if (config?.country) args.push('--country', String(config.country));
|
||||
return { command: 'npx', args };
|
||||
}
|
||||
case 'match-search-results': {
|
||||
const args = ['tsx', 'scripts/match-search-results.ts'];
|
||||
if (limit) args.push('--limit', String(limit));
|
||||
|
||||
299
scripts/scrape-churches.ts
Normal file
299
scripts/scrape-churches.ts
Normal file
@@ -0,0 +1,299 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Bulk church website scraper
|
||||
* Scrapes mass schedules from church websites and updates the database.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/scrape-churches.ts --limit 100
|
||||
* npx tsx scripts/scrape-churches.ts --limit 50 --max-failures 3
|
||||
* npx tsx scripts/scrape-churches.ts --all # Process ALL eligible churches
|
||||
* npx tsx scripts/scrape-churches.ts --all --language english
|
||||
* npx tsx scripts/scrape-churches.ts --all --max-failures 3
|
||||
* npx tsx scripts/scrape-churches.ts --ids id1,id2,id3
|
||||
* npx tsx scripts/scrape-churches.ts --all --job-id <uuid> # Resume/track existing job
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { scrapeAllChurches, scrapeChurch, countEligibleChurches } from '../src/lib/scraper-service';
|
||||
import type { ScrapeJobResult } from '../src/lib/scraper-service';
|
||||
|
||||
// Fresh DB connection for scripts
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const jobPrisma = new PrismaClient({ adapter });
|
||||
|
||||
let shuttingDown = false;
|
||||
|
||||
/**
 * Render a duration as a short human-readable string.
 *
 * - under a minute: `"42s"`
 * - under an hour:  `"3m 5s"`
 * - otherwise:      `"2h 7m"` (leftover seconds are dropped)
 *
 * The input is floored to whole seconds before it is bucketed, so every
 * branch truncates the same way and the sub-minute branch can never print
 * `"60s"` (which rounding 59.5–59.99 used to produce). Negative or
 * non-finite input is treated as zero.
 *
 * @param seconds Duration in seconds; may be fractional.
 * @returns The formatted duration.
 */
function formatDuration(seconds: number): string {
  const whole = Number.isFinite(seconds) && seconds > 0 ? Math.floor(seconds) : 0;
  if (whole < 60) return `${whole}s`;
  if (whole < 3600) return `${Math.floor(whole / 60)}m ${whole % 60}s`;
  const hours = Math.floor(whole / 3600);
  const minutes = Math.floor((whole % 3600) / 60);
  return `${hours}h ${minutes}m`;
}
|
||||
|
||||
// --- Job Tracking ---
|
||||
|
||||
/**
 * Resume a job that was created elsewhere and handed to this script via
 * `--job-id <id>`.
 *
 * When the flag is present the matching `backgroundJob` row is switched to
 * status `running` and its `startedAt` is reset to now.
 *
 * @param args Command-line arguments (without the node/script prefix).
 * @returns The resumed job ID, or `null` when `--job-id` was not passed.
 * @throws Error when `--job-id` is the last argument or is followed by
 *   another flag. Previously `undefined` was forwarded to Prisma as the
 *   lookup key. Errors from the database update propagate unchanged.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) return null;

  const jobId = args[flagIndex + 1];
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job ID value');
  }

  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||||
|
||||
/**
 * Insert a fresh `backgroundJob` row of type `scraper`, already marked as
 * running, and hand back its ID.
 *
 * @param language Language filter for the run; stored as `'generic'` when
 *   null or empty.
 * @param config Run parameters persisted with the job row.
 * @returns The ID of the newly created job.
 */
async function createNewJob(language: string | null, config: Record<string, unknown>): Promise<string> {
  const { id } = await jobPrisma.backgroundJob.create({
    data: {
      type: 'scraper',
      language: language || 'generic',
      status: 'running',
      startedAt: new Date(),
      config,
    },
  });
  return id;
}
|
||||
|
||||
/**
 * Persist the running counters of a job so they can be read back from the
 * `backgroundJob` table.
 *
 * @param jobId ID of the job row to update.
 * @param processed Number of items handled so far.
 * @param succeeded Number of items that finished successfully.
 * @param failed Number of items that failed.
 * @param itemsFound Number of result items (schedules) found so far.
 * @param totalItems Total number of items the run expects to handle.
 */
async function updateJobProgress(jobId: string, processed: number, succeeded: number, failed: number, itemsFound: number, totalItems: number): Promise<void> {
  const counters = { processed, succeeded, failed, itemsFound, totalItems };
  await jobPrisma.backgroundJob.update({ where: { id: jobId }, data: counters });
}
|
||||
|
||||
/**
 * Report whether the job row currently carries the status `stopping`.
 *
 * @param jobId ID of the job to look up.
 * @returns `true` only when the row exists and its status is `stopping`;
 *   `false` when the row is missing or has any other status.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const record = await jobPrisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!record) return false;
  return record.status === 'stopping';
}
|
||||
|
||||
/**
 * Move a job into its terminal state and stamp the completion time.
 *
 * @param jobId ID of the job row to finalize.
 * @param error Failure message. When it is a non-empty string the job is
 *   marked `failed`; otherwise it is marked `completed`.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const finalStatus = error ? 'failed' : 'completed';
  await jobPrisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: finalStatus, error, completedAt: new Date() },
  });
}
|
||||
|
||||
/**
 * CLI entry point for the bulk church scraper.
 *
 * Three modes, chosen from the command line:
 * - `--ids a,b,c`  scrape exactly the listed churches (no job tracking);
 * - `--all`        loop in batches of 100 until no eligible church is
 *                  returned, a stop is requested, or a signal arrives;
 * - default        a single batch of `--limit` churches (default 100).
 *
 * Flags that take a value (`--limit`, `--max-failures`, `--ids`,
 * `--language`) are validated up front: a missing value or a non-numeric
 * number raises an Error instead of flowing through as `undefined`/`NaN`.
 *
 * @throws Error on invalid command-line flags; scraping/database errors are
 *   recorded on the job row (when one exists) and then re-thrown.
 */
async function main() {
  const args = process.argv.slice(2);
  const allMode = args.includes('--all');

  // Value following `flag`, or undefined when the flag is absent.
  const flagValue = (flag: string): string | undefined => {
    const index = args.indexOf(flag);
    if (index === -1) return undefined;
    const value = args[index + 1];
    if (value === undefined || value.startsWith('--')) {
      throw new Error(`${flag} requires a value`);
    }
    return value;
  };

  // Non-negative integer following `flag`, or `fallback` when the flag is absent.
  const intFlag = (flag: string, fallback: number): number => {
    const raw = flagValue(flag);
    if (raw === undefined) return fallback;
    const parsed = Number.parseInt(raw, 10);
    if (Number.isNaN(parsed) || parsed < 0) {
      throw new Error(`${flag} must be a non-negative integer, got "${raw}"`);
    }
    return parsed;
  };

  const maxFailures = intFlag('--max-failures', 5);
  const language = flagValue('--language') ?? null;
  const idsArg = flagValue('--ids');
  const ids = idsArg === undefined
    ? null
    : idsArg.split(',').map((id) => id.trim()).filter((id) => id.length > 0);

  // --ids mode: scrape specific churches
  if (ids) {
    if (ids.length === 0) {
      throw new Error('--ids requires at least one church ID');
    }

    console.log('============================================================');
    console.log('Church Website Scraper — Targeted Mode');
    console.log('============================================================');
    console.log(`Targeting ${ids.length} specific churches`);
    console.log(`Max failures: ${maxFailures}`);
    console.log(`Started: ${new Date().toISOString()}`);
    console.log('============================================================\n');

    const startTime = Date.now();
    const results = await Promise.all(ids.map((id) => scrapeChurch(id)));
    printSummary(results, startTime);
    return;
  }

  // --all mode: batch loop through ALL eligible churches
  if (allMode) {
    const BATCH_SIZE = 100;
    const totalEligible = await countEligibleChurches(maxFailures);

    console.log('============================================================');
    console.log('Church Website Scraper — Full Run');
    console.log('============================================================');
    console.log(`Language: ${language || 'all'}`);
    console.log(`Eligible churches: ${totalEligible.toLocaleString()}`);
    console.log(`Batch size: ${BATCH_SIZE}`);
    console.log(`Max failures: ${maxFailures}`);
    console.log(`Started: ${new Date().toISOString()}`);
    console.log('============================================================\n');

    if (totalEligible === 0) {
      console.log('No eligible churches to scrape. All done!');
      // A job handed over via --job-id must still reach a terminal state;
      // otherwise its row would keep whatever status it had before this run.
      const trackedJobId = await createOrResumeJob(args);
      if (trackedJobId) {
        await completeJob(trackedJobId);
      }
      return;
    }

    // Job tracking
    const jobId =
      (await createOrResumeJob(args)) ??
      (await createNewJob(language, { allMode: true, maxFailures, language }));
    console.log(`Job ID: ${jobId}\n`);

    // Graceful shutdown handlers: first SIGINT finishes the batch, second one force-quits.
    process.on('SIGINT', () => {
      if (shuttingDown) {
        console.log('\nForce quit.');
        process.exit(1);
      }
      console.log('\nShutting down gracefully (finishing current batch)...');
      shuttingDown = true;
    });
    process.on('SIGTERM', () => {
      console.log('\nSIGTERM received, shutting down after current batch...');
      shuttingDown = true;
    });

    const allResults: ScrapeJobResult[] = [];
    const globalStart = Date.now();
    let batchNum = 0;
    let totalSchedulesFound = 0;

    try {
      while (!shuttingDown) {
        batchNum++;
        const batchStart = Date.now();

        const batchResults = await scrapeAllChurches({ limit: BATCH_SIZE, maxFailures, language: language || undefined });

        if (batchResults.length === 0) {
          console.log('\nNo more eligible churches. All done!');
          break;
        }

        allResults.push(...batchResults);

        // Batch summary
        const batchElapsed = (Date.now() - batchStart) / 1000;
        const batchSuccess = batchResults.filter((r) => r.success).length;
        const batchSchedules = batchResults.reduce((sum, r) => sum + r.schedulesFound, 0);
        totalSchedulesFound += batchSchedules;

        // Overall progress
        const totalElapsed = (Date.now() - globalStart) / 1000;
        const rate = allResults.length / (totalElapsed / 3600);
        const remaining = totalEligible - allResults.length;
        const etaSeconds = remaining > 0 && rate > 0 ? (remaining / rate) * 3600 : 0;

        console.log(`\n--- Batch ${batchNum} (${batchResults.length} churches) ---`);
        console.log(` Success: ${batchSuccess}/${batchResults.length} | Schedules: ${batchSchedules} | Time: ${formatDuration(batchElapsed)}`);
        console.log(` Progress: ${allResults.length.toLocaleString()}/${totalEligible.toLocaleString()} (${((allResults.length / totalEligible) * 100).toFixed(1)}%)`);
        console.log(` Rate: ${rate.toFixed(0)}/hr | ETA: ~${formatDuration(etaSeconds)}`);

        // Update job progress
        const succeeded = allResults.filter((r) => r.success).length;
        const failed = allResults.length - succeeded;
        await updateJobProgress(jobId, allResults.length, succeeded, failed, totalSchedulesFound, totalEligible);

        // Poll for a stop request after every batch. The previous
        // `allResults.length % 10 === 0` gate stopped firing for good once a
        // batch returned a count that was not a multiple of ten.
        if (await checkJobStopping(jobId)) {
          console.log('\nJob stop requested via admin dashboard.');
          shuttingDown = true;
        }

        if (shuttingDown) {
          console.log('\nGraceful shutdown: batch completed.');
          break;
        }
      }

      await completeJob(jobId);
    } catch (error) {
      await completeJob(jobId, error instanceof Error ? error.message : 'Unknown error');
      throw error;
    }

    printSummary(allResults, globalStart);
    return;
  }

  // Default mode: single batch with --limit
  const limit = intFlag('--limit', 100);

  console.log('============================================================');
  console.log('Church Website Scraper');
  console.log('============================================================');
  console.log(`Language: ${language || 'all'}`);
  console.log(`Limit: ${limit}`);
  console.log(`Max failures: ${maxFailures}`);
  console.log(`Started: ${new Date().toISOString()}`);
  console.log('============================================================\n');

  // Job tracking for single batch mode too
  const jobId =
    (await createOrResumeJob(args)) ??
    (await createNewJob(language, { limit, maxFailures, language }));
  console.log(`Job ID: ${jobId}\n`);

  const startTime = Date.now();
  try {
    const results = await scrapeAllChurches({ limit, maxFailures, language: language || undefined });
    const succeeded = results.filter((r) => r.success).length;
    const failed = results.length - succeeded;
    const totalSchedules = results.reduce((sum, r) => sum + r.schedulesFound, 0);
    await updateJobProgress(jobId, results.length, succeeded, failed, totalSchedules, limit);
    await completeJob(jobId);
    printSummary(results, startTime);
  } catch (error) {
    await completeJob(jobId, error instanceof Error ? error.message : 'Unknown error');
    throw error;
  }
}
|
||||
|
||||
/**
 * Print the end-of-run report: totals, elapsed time, hourly rate and the
 * first 50 failures (with a count of any that were left out).
 *
 * @param results Per-church outcomes collected during the run.
 * @param startTime Epoch milliseconds at which the run started.
 */
function printSummary(results: ScrapeJobResult[], startTime: number) {
  const MAX_FAILURES_SHOWN = 50;
  const elapsedSeconds = (Date.now() - startTime) / 1000;

  // Single pass: split successes from failures and total the schedules.
  const failures: ScrapeJobResult[] = [];
  let successCount = 0;
  let scheduleTotal = 0;
  for (const result of results) {
    if (result.success) {
      successCount++;
    } else {
      failures.push(result);
    }
    scheduleTotal += result.schedulesFound;
  }

  const perHour = results.length / (elapsedSeconds / 3600);

  const report = [
    '\n============================================================',
    'Scraping Summary',
    '============================================================',
    `Churches processed: ${results.length.toLocaleString()}`,
    `Succeeded: ${successCount.toLocaleString()}`,
    `Failed: ${failures.length.toLocaleString()}`,
    `Total schedules found: ${scheduleTotal.toLocaleString()}`,
    `Elapsed time: ${formatDuration(elapsedSeconds)}`,
    `Average rate: ${perHour.toFixed(0)}/hr`,
    `Finished: ${new Date().toISOString()}`,
    '============================================================',
  ];
  for (const line of report) {
    console.log(line);
  }

  if (failures.length === 0) return;

  console.log(`\nFailed churches (${failures.length}):`);
  // Cap the listing so a bad run does not flood the output.
  for (const failure of failures.slice(0, MAX_FAILURES_SHOWN)) {
    console.log(` - ${failure.churchName}: ${failure.error}`);
  }
  if (failures.length > MAX_FAILURES_SHOWN) {
    console.log(` ... and ${failures.length - 50} more`);
  }
}
|
||||
|
||||
// Script entry point.
//
// On failure the error is logged and the exit code is set to 1, but the
// process is only terminated AFTER the database handles have been released.
// Calling process.exit() directly inside the rejection handler ended the
// process before the cleanup callback could run.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    try {
      await jobPrisma.$disconnect();
      await pool.end();
    } finally {
      // After a failure, force the exit so lingering handles cannot keep the
      // process alive; process.exit() with no argument uses process.exitCode.
      if (process.exitCode) {
        process.exit();
      }
    }
  });
|
||||
372
scripts/scrape-diocese-directory.ts
Normal file
372
scripts/scrape-diocese-directory.ts
Normal file
@@ -0,0 +1,372 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Scrape diocese directories to discover parish URLs and mass schedules
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/scrape-diocese-directory.ts --diocese <id> # Single diocese
|
||||
* npx tsx scripts/scrape-diocese-directory.ts --country DE # All dioceses in country
|
||||
* npx tsx scripts/scrape-diocese-directory.ts --all # All active dioceses
|
||||
* npx tsx scripts/scrape-diocese-directory.ts --all --dry-run # Preview only
|
||||
* npx tsx scripts/scrape-diocese-directory.ts --job-id <uuid> # Resume tracked job
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { DioceseDirectoryScraper, DioceseScrapeConfig } from '../src/scrapers/diocese-directory-scraper';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Write an informational line to stdout, prefixed with the current UTC
 * timestamp in ISO-8601 form inside square brackets.
 *
 * @param msg Text to print after the timestamp.
 */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
|
||||
|
||||
/**
 * Write an error line to stderr, prefixed with the current UTC timestamp
 * in ISO-8601 form and the literal tag `ERROR:`.
 *
 * @param msg Text to print after the tag.
 */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error('[' + stamp + '] ERROR: ' + msg);
}
|
||||
|
||||
/**
 * Great-circle distance between two points using the haversine formula,
 * with a spherical Earth of radius 6371 km.
 *
 * @param lat1 Latitude of the first point, in degrees.
 * @param lon1 Longitude of the first point, in degrees.
 * @param lat2 Latitude of the second point, in degrees.
 * @param lon2 Longitude of the second point, in degrees.
 * @returns Distance in kilometres.
 */
function haversineKm(lat1: number, lon1: number, lat2: number, lon2: number): number {
  const EARTH_RADIUS_KM = 6371;
  const toRadians = (degrees: number): number => degrees * Math.PI / 180;

  const sinHalfDLat = Math.sin(toRadians(lat2 - lat1) / 2);
  const sinHalfDLon = Math.sin(toRadians(lon2 - lon1) / 2);

  // Haversine of the central angle between the two points.
  const h = sinHalfDLat ** 2 +
    Math.cos(toRadians(lat1)) * Math.cos(toRadians(lat2)) *
    sinHalfDLon ** 2;

  return EARTH_RADIUS_KM * 2 * Math.asin(Math.sqrt(h));
}
|
||||
|
||||
/**
 * Reduce a name to a canonical form for fuzzy comparison: lower-case,
 * combining accents stripped, every character other than `a-z`, `0-9` and
 * whitespace removed, whitespace runs collapsed to one space, and the ends
 * trimmed.
 *
 * @param str Raw name.
 * @returns The normalized string (possibly empty).
 */
function normalizeForMatch(str: string): string {
  const lowered = str.toLowerCase();
  // Decompose accented letters, then drop the combining marks (U+0300–U+036F).
  const unaccented = lowered.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
  const alphanumeric = unaccented.replace(/[^a-z0-9\s]/g, '');
  return alphanumeric.replace(/\s+/g, ' ').trim();
}
|
||||
|
||||
/** An existing church row selected as the best match for a scraped parish. */
interface MatchCandidate {
  /** Primary key of the matched church. */
  id: string;
  /** Name of the matched church as stored in the database. */
  name: string;
  /** Latitude copied from the church row. */
  latitude: number;
  /** Longitude copied from the church row. */
  longitude: number;
  /** Distance to the parish; the matcher in this file always sets it to 0. */
  distance: number;
  /** Share of the parish name's significant words found in the church name (0–1). */
  nameScore: number;
}
|
||||
|
||||
/**
 * Look for an existing church whose name overlaps with a scraped parish name.
 *
 * Up to 50 churches from the same country (and, when a city is given, whose
 * city contains it case-insensitively) are loaded. Each is scored by the
 * fraction of the parish name's words of three or more characters that also
 * appear in the church's normalized name. Scores below 0.4 are discarded;
 * the highest score wins and the earliest candidate wins ties.
 *
 * @param name Parish name as scraped.
 * @param address Parish address; accepted but not used by the matcher.
 * @param city Parish city, used to narrow the candidate query when present.
 * @param country Country value the candidates must have.
 * @returns The best candidate, or `null` when the name has no usable words
 *   or no candidate reaches the threshold.
 */
async function findMatchingChurch(
  name: string,
  address: string | undefined,
  city: string | undefined,
  country: string,
): Promise<MatchCandidate | null> {
  const MIN_NAME_SCORE = 0.4;
  const significantWords = (text: string): string[] =>
    normalizeForMatch(text).split(' ').filter(word => word.length >= 3);

  const wanted = significantWords(name);
  if (wanted.length === 0) return null;

  // Candidate churches: same country, optionally narrowed by city.
  const nearby = await prisma.church.findMany({
    where: {
      country,
      ...(city ? { city: { contains: city, mode: 'insensitive' } } : {}),
    },
    select: { id: true, name: true, latitude: true, longitude: true, website: true },
    take: 50,
  });

  let best: MatchCandidate | null = null;

  for (const candidate of nearby) {
    const available = new Set(significantWords(candidate.name));
    const hits = wanted.filter(word => available.has(word)).length;
    const score = hits / wanted.length;

    if (score < MIN_NAME_SCORE) continue;
    // Strictly-greater comparison keeps the first candidate on ties.
    if (best !== null && score <= best.nameScore) continue;

    best = {
      id: candidate.id,
      name: candidate.name,
      latitude: candidate.latitude,
      longitude: candidate.longitude,
      distance: 0,
      nameScore: score,
    };
  }

  return best;
}
|
||||
|
||||
// --- Job Tracking ---
|
||||
|
||||
/**
 * Resume a job that was created elsewhere and handed to this script via
 * `--job-id <id>`.
 *
 * When the flag is present the matching `backgroundJob` row is switched to
 * status `running` and its `startedAt` is reset to now.
 *
 * @param args Command-line arguments (without the node/script prefix).
 * @returns The resumed job ID, or `null` when `--job-id` was not passed.
 * @throws Error when `--job-id` is the last argument or is followed by
 *   another flag. Previously `undefined` was forwarded to Prisma as the
 *   lookup key. Errors from the database update propagate unchanged.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) return null;

  const jobId = args[flagIndex + 1];
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job ID value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
||||
|
||||
/**
 * Scrape one diocese directory and reconcile the discovered parishes with
 * existing church rows.
 *
 * The diocese is skipped (with a log line) when it does not exist, has no
 * directory URL, or has no scrape config with selectors. For every parish
 * found, `findMatchingChurch` is consulted:
 * - on a match, unless `dryRun`, the church is linked to the diocese, its
 *   website is updated when the parish carries a URL, and its mass
 *   schedules are replaced when the parish carries any;
 * - without a match the parish is only logged and counted.
 *
 * Schedule replacement runs inside a single transaction so a failed insert
 * cannot leave the church with its old schedules deleted and nothing in
 * their place.
 *
 * A failure while scraping or saving is caught, counted in `stats.errors`
 * and recorded on the diocese row; it is not re-thrown. The scraper is
 * always closed.
 *
 * @param dioceseId ID of the diocese to scrape.
 * @param dryRun When true, nothing is written to the database.
 * @param stats Running counters, mutated in place.
 */
async function scrapeDiocese(
  dioceseId: string,
  dryRun: boolean,
  stats: { processed: number; matched: number; created: number; schedules: number; errors: number }
): Promise<void> {
  const diocese = await prisma.diocese.findUnique({ where: { id: dioceseId } });
  if (!diocese) {
    logError(`Diocese not found: ${dioceseId}`);
    return;
  }

  if (!diocese.directoryUrl) {
    log(` Skipping ${diocese.name}: no directory URL`);
    return;
  }

  const config = diocese.scrapeConfig as DioceseScrapeConfig | null;
  if (!config?.selectors) {
    log(` Skipping ${diocese.name}: no scrape config`);
    return;
  }

  log(`Scraping diocese: ${diocese.name} (${diocese.country})`);
  log(` Directory URL: ${diocese.directoryUrl}`);

  const scraper = new DioceseDirectoryScraper();

  try {
    let parishes;

    if (config.scheduleInDirectory) {
      parishes = await scraper.scrapeDirectoryWithSchedules(
        diocese.directoryUrl,
        config,
        diocese.language
      );
    } else {
      // Directory without schedule data: pad each parish with empty schedule fields.
      const discovered = await scraper.scrapeDirectory(diocese.directoryUrl, config);
      parishes = discovered.map(p => ({
        ...p,
        scheduleText: '',
        schedules: [] as Array<{ dayOfWeek: number; time: string; massType?: string; language?: string; notes?: string }>,
      }));
    }

    log(` Discovered ${parishes.length} parishes`);

    for (const parish of parishes) {
      stats.processed++;

      // Try to match to existing church
      const match = await findMatchingChurch(
        parish.name,
        parish.address,
        parish.city,
        diocese.country,
      );

      if (match) {
        stats.matched++;
        log(` Match: "${parish.name}" -> "${match.name}" (score: ${match.nameScore.toFixed(2)})`);

        if (!dryRun) {
          // Link the church to the diocese. The website fields are only
          // touched when the directory actually supplied a URL, so an empty
          // value cannot overwrite a stored website or claim one exists.
          await prisma.church.update({
            where: { id: match.id },
            data: {
              ...(parish.url ? { website: parish.url, hasWebsite: true } : {}),
              dioceseId: diocese.id,
            },
          });

          // Save schedules if available
          if ('schedules' in parish && parish.schedules.length > 0) {
            const rows = parish.schedules.map(s => ({
              churchId: match.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              massType: s.massType,
              language: s.language ?? 'English',
              notes: s.notes,
            }));

            // Delete + insert as one unit of work.
            await prisma.$transaction(async (tx) => {
              await tx.massSchedule.deleteMany({ where: { churchId: match.id } });
              await tx.massSchedule.createMany({ data: rows });
            });
            stats.schedules += rows.length;
          }
        }
      } else {
        log(` No match: "${parish.name}" (${parish.city || 'no city'})`);
        stats.created++;

        // In non-dry-run, we could create new churches, but for safety
        // we only log unmatched parishes for manual review
        // (Creating churches from directory data without coordinates is risky)
      }
    }

    // Update diocese tracking
    if (!dryRun) {
      await prisma.diocese.update({
        where: { id: diocese.id },
        data: {
          lastScrapedAt: new Date(),
          lastSuccessAt: new Date(),
          churchCount: parishes.length,
          failureCount: 0,
        },
      });
    }
  } catch (err: unknown) {
    stats.errors++;
    const message = err instanceof Error ? err.message : String(err);
    logError(` Failed to scrape ${diocese.name}: ${message}`);

    if (!dryRun) {
      await prisma.diocese.update({
        where: { id: diocese.id },
        data: {
          lastScrapedAt: new Date(),
          lastFailureAt: new Date(),
          failureCount: { increment: 1 },
        },
      });
    }
  } finally {
    await scraper.close();
  }
}
|
||||
|
||||
/**
 * CLI entry point for the diocese directory scraper.
 *
 * Target selection: `--diocese <id>` scrapes one diocese; otherwise every
 * active diocese with a directory URL is scraped, optionally restricted by
 * `--country <code>`. `--dry-run` disables database writes and job
 * creation. A job passed with `--job-id` is resumed; otherwise a new
 * `diocese-directory` job is created (except in dry-run).
 *
 * `--diocese` and `--country` now require a value. A missing value used to
 * be read as "flag not given", which silently widened the run to all
 * dioceses.
 *
 * After each diocese the job counters are saved and the job status is
 * checked; the loop ends early when the status is `stopping`. Database
 * handles are released on both the success and the failure path.
 *
 * @throws Error on invalid flags; any fatal error is recorded on the job
 *   row (when one exists) and re-thrown.
 */
async function main() {
  const args = process.argv.slice(2);
  const dryRun = args.includes('--dry-run');
  const all = args.includes('--all');

  // Value following `flag`, or undefined when the flag is absent.
  const flagValue = (flag: string): string | undefined => {
    const index = args.indexOf(flag);
    if (index === -1) return undefined;
    const value = args[index + 1];
    if (value === undefined || value.startsWith('--')) {
      throw new Error(`${flag} requires a value`);
    }
    return value;
  };

  const dioceseId = flagValue('--diocese');
  const country = flagValue('--country');

  log('============================================================');
  log('Diocese Directory Scraper');
  log('============================================================');
  log(`Mode: ${dryRun ? 'Dry run' : 'Execute'}`);
  log(`Target: ${dioceseId ? `Diocese ${dioceseId}` : country ? `Country ${country}` : 'All active'}`);
  log('============================================================');

  try {
    // Job tracking
    let jobId = await createOrResumeJob(args);
    if (!jobId && !dryRun) {
      const job = await prisma.backgroundJob.create({
        data: {
          type: 'diocese-directory',
          status: 'running',
          startedAt: new Date(),
          config: { dioceseId, country, all, dryRun },
        },
      });
      jobId = job.id;
      log(`Job ID: ${jobId}`);
    }

    const stats = { processed: 0, matched: 0, created: 0, schedules: 0, errors: 0 };

    try {
      let dioceses: Array<{ id: string }>;

      if (dioceseId) {
        dioceses = [{ id: dioceseId }];
      } else {
        dioceses = await prisma.diocese.findMany({
          where: {
            active: true,
            directoryUrl: { not: null },
            ...(country ? { country } : {}),
          },
          select: { id: true, name: true },
          orderBy: { name: 'asc' },
        });
      }

      log(`Found ${dioceses.length} dioceses to scrape`);

      for (const d of dioceses) {
        await scrapeDiocese(d.id, dryRun, stats);

        // Save progress, then check for job stop
        if (jobId) {
          await prisma.backgroundJob.update({
            where: { id: jobId },
            data: { processed: stats.processed, succeeded: stats.matched, itemsFound: stats.matched },
          });
          const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
          if (job?.status === 'stopping') {
            log('Job stop requested.');
            break;
          }
        }
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      logError(`Fatal error: ${message}`);
      if (jobId) {
        await prisma.backgroundJob.update({
          where: { id: jobId },
          data: { status: 'failed', error: message, completedAt: new Date() },
        });
      }
      throw error;
    }

    // Complete job
    if (jobId) {
      await prisma.backgroundJob.update({
        where: { id: jobId },
        data: {
          status: 'completed',
          completedAt: new Date(),
          processed: stats.processed,
          succeeded: stats.matched,
          itemsFound: stats.matched,
        },
      });
    }

    log('');
    log('============================================================');
    log('Diocese Directory Scraper Summary');
    log('============================================================');
    log(`Parishes discovered: ${stats.processed}`);
    log(`Matched to DB: ${stats.matched}`);
    log(`Unmatched (new): ${stats.created}`);
    log(`Schedules saved: ${stats.schedules}`);
    log(`Errors: ${stats.errors}`);
    log('============================================================');
  } finally {
    // Release database handles whether the run succeeded or failed.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
// Script entry point: run main() and, if it rejects, log the failure and
// terminate with exit status 1.
void main().then(undefined, (error) => {
  logError(`Fatal error: ${error.message}`);
  process.exit(1);
});
|
||||
171
scripts/scrape-masstimes.ts
Normal file
171
scripts/scrape-masstimes.ts
Normal file
@@ -0,0 +1,171 @@
|
||||
import 'dotenv/config';
|
||||
import { prisma } from '../src/lib/db';
|
||||
import { MassTimesScraper, ChurchData } from '../src/lib/masstimes-scraper';
|
||||
|
||||
/**
 * Two-letter codes of the regions to scrape, in scraping order: the 50 US
 * states plus DC.
 */
const TARGET_STATES = (
  'AL AK AZ AR CA CO CT DE DC FL ' +
  'GA HI ID IL IN IA KS KY LA ME ' +
  'MD MA MI MN MS MO MT NE NV NH ' +
  'NJ NM NY NC ND OH OK OR PA RI ' +
  'SC SD TN TX UT VT VA WA WV WI ' +
  'WY'
).split(' ');
|
||||
|
||||
/**
 * Drop schedule entries that repeat an earlier entry's day, time and
 * language. The first occurrence of each combination is kept and the
 * original order is preserved; the input array is not modified.
 *
 * @param schedules Entries to filter.
 * @returns A new array holding the first entry for each
 *   `dayOfWeek`/`time`/`language` combination.
 */
function deduplicateMassSchedules<T extends { dayOfWeek: number; time: string; language: string }>(schedules: T[]): T[] {
  const claimed = new Set<string>();
  return schedules.filter(({ dayOfWeek, time, language }) => {
    const key = `${dayOfWeek}:${time}:${language}`;
    if (claimed.has(key)) return false;
    claimed.add(key);
    return true;
  });
}
|
||||
|
||||
/**
 * Upsert one scraped church and replace all of its mass, confession and
 * adoration schedules, inside a single transaction.
 *
 * A church whose `masstimesId` is already in `seenIds` is skipped. On
 * success the ID is added to `seenIds`. Any error is logged and swallowed.
 *
 * @param data Scraped church record.
 * @param seenIds IDs saved earlier in this run; mutated on success.
 * @returns `true` when the church was written; `false` when it was skipped
 *   as a duplicate or the write failed.
 */
async function saveChurch(data: ChurchData, seenIds: Set<string>): Promise<boolean> {
  const { masstimesId } = data;

  if (seenIds.has(masstimesId)) {
    console.log(` Skipping duplicate: ${data.name}`);
    return false;
  }

  try {
    await prisma.$transaction(async (tx) => {
      // Columns written identically on insert and on update.
      const details = {
        name: data.name,
        address: data.address,
        city: data.city,
        state: data.state,
        zip: data.zip,
        latitude: data.latitude,
        longitude: data.longitude,
        phone: data.phone,
        website: data.website,
        email: data.email,
        pastorName: data.pastorName,
        diocese: data.diocese,
        directions: data.directions,
        wheelchairAccess: data.wheelchairAccess,
        lastScrapedAt: new Date(),
      };

      // Country and scrape strategy are only set when the row is first created.
      const { id: churchId } = await tx.church.upsert({
        where: { masstimesId },
        create: {
          masstimesId,
          ...details,
          country: data.country,
          scrapeStrategy: 'masstimes',
        },
        update: details,
      });

      // Clear the previous schedules before inserting the fresh ones.
      await tx.massSchedule.deleteMany({ where: { churchId } });
      await tx.confessionSchedule.deleteMany({ where: { churchId } });
      await tx.adorationSchedule.deleteMany({ where: { churchId } });

      if (data.massSchedules.length > 0) {
        const massRows = deduplicateMassSchedules(data.massSchedules).map((mass) => ({
          churchId,
          dayOfWeek: mass.dayOfWeek,
          time: mass.time,
          massType: mass.massType,
          language: mass.language,
          notes: mass.notes,
        }));
        await tx.massSchedule.createMany({ data: massRows });
      }

      if (data.confessionSchedules.length > 0) {
        const confessionRows = data.confessionSchedules.map((slot) => ({
          churchId,
          dayOfWeek: slot.dayOfWeek,
          startTime: slot.startTime,
          endTime: slot.endTime,
          notes: slot.notes,
        }));
        await tx.confessionSchedule.createMany({ data: confessionRows });
      }

      if (data.adorationSchedules.length > 0) {
        const adorationRows = data.adorationSchedules.map((slot) => ({
          churchId,
          dayOfWeek: slot.dayOfWeek,
          startTime: slot.startTime,
          endTime: slot.endTime,
          isPerpetual: slot.isPerpetual,
          notes: slot.notes,
        }));
        await tx.adorationSchedule.createMany({ data: adorationRows });
      }
    });

    seenIds.add(masstimesId);
    console.log(` Saved: ${data.name}`);
    return true;
  } catch (error) {
    console.error(` Error saving ${data.name}:`, error);
    return false;
  }
}
|
||||
|
||||
/**
 * Scrape every state in `TARGET_STATES` and save the churches found.
 *
 * States are processed one after another with a five-minute pause between
 * them; there is no pause after the last state. Churches whose
 * `masstimesId` was already saved in this run are counted as skipped, not
 * as errors. The scraper and the database client are always closed, even
 * when a state fails.
 */
async function main() {
  const REST_BETWEEN_STATES_MS = 300000;
  const seenIds = new Set<string>();
  console.log('\n' + '='.repeat(70));
  console.log('MASSTIMES.ORG CHURCH SCRAPER (JSON API)');
  console.log('='.repeat(70));
  console.log(`\nTarget states: ${TARGET_STATES.length}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('\n' + '-'.repeat(70));

  const scraper = new MassTimesScraper();
  const stats = { total: 0, saved: 0, skipped: 0, errors: 0 };

  try {
    await scraper.init();
    console.log('Browser initialized\n');

    for (let i = 0; i < TARGET_STATES.length; i++) {
      const state = TARGET_STATES[i];
      console.log(`\n[${'='.repeat(20)}] SCRAPING ${state} [${'='.repeat(20)}]\n`);
      console.log(`State ${i + 1}/${TARGET_STATES.length}: ${state}`);
      const churches = await scraper.scrapeState(state);
      stats.total += churches.length;
      console.log(`\n Saving ${churches.length} churches from ${state} to database...`);
      for (const church of churches) {
        // Duplicates are expected and must not inflate the error count.
        if (seenIds.has(church.masstimesId)) {
          console.log(` Skipping duplicate: ${church.name}`);
          stats.skipped++;
          continue;
        }
        const saved = await saveChurch(church, seenIds);
        if (saved) stats.saved++;
        else stats.errors++;
      }

      // Pause between states only; nothing follows the last one.
      const isLastState = i === TARGET_STATES.length - 1;
      if (!isLastState) {
        console.log(`\n Resting 5 minutes before next state...\n`);
        await new Promise(resolve => setTimeout(resolve, REST_BETWEEN_STATES_MS));
      }
    }
  } finally {
    await scraper.close();
    await prisma.$disconnect();
  }

  console.log('\n' + '='.repeat(70));
  console.log('SUMMARY');
  console.log('='.repeat(70));
  console.log(`Total scraped: ${stats.total}`);
  console.log(`Saved: ${stats.saved}`);
  console.log(`Skipped (duplicates): ${stats.skipped}`);
  console.log(`Errors: ${stats.errors}`);
  console.log('='.repeat(70) + '\n');
}
|
||||
|
||||
// Script entry point. A rejected run is logged AND reflected in the exit
// status; logging alone left the process exiting with status 0 on failure.
// process.exitCode is set instead of calling process.exit() so pending
// output can still flush.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
328
scripts/setup-diocese.ts
Executable file
328
scripts/setup-diocese.ts
Executable file
@@ -0,0 +1,328 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Interactive helper to configure a new diocese for scraping
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/setup-diocese.ts --url https://bistum-mainz.de/pfarreien --country DE --language de
|
||||
* npx tsx scripts/setup-diocese.ts --url https://diocese-paris.fr/paroisses --country FR --language fr
|
||||
* npx tsx scripts/setup-diocese.ts --list # List all configured dioceses
|
||||
* npx tsx scripts/setup-diocese.ts --test <diocese-id> # Test scraping a diocese
|
||||
*/
|
||||
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { DioceseDirectoryScraper, DioceseScrapeConfig } from '../src/scrapers/diocese-directory-scraper';
|
||||
import readline from 'readline';
|
||||
|
||||
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
|
||||
const adapter = new PrismaPg(pool);
|
||||
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
/**
 * Write an informational line to stdout, prefixed with the current UTC
 * timestamp in ISO-8601 form inside square brackets.
 *
 * @param msg Text to print after the timestamp.
 */
function log(msg: string) {
  const stamp = new Date().toISOString();
  console.log('[' + stamp + '] ' + msg);
}
|
||||
|
||||
/**
 * Write an error line to stderr, prefixed with the current UTC timestamp
 * in ISO-8601 form and the literal tag `ERROR:`.
 *
 * @param msg Text to print after the tag.
 */
function logError(msg: string) {
  const stamp = new Date().toISOString();
  console.error('[' + stamp + '] ERROR: ' + msg);
}
|
||||
|
||||
/**
 * Prompt on the terminal and wait for one line of input.
 *
 * A readline interface over stdin/stdout is opened for the single question
 * and closed as soon as the answer arrives.
 *
 * @param question Prompt text shown to the user.
 * @returns The answer with surrounding whitespace trimmed.
 */
function ask(question: string): Promise<string> {
  const terminal = readline.createInterface({ input: process.stdin, output: process.stdout });

  return new Promise<string>((resolve) => {
    const onAnswer = (reply: string): void => {
      terminal.close();
      resolve(reply.trim());
    };
    terminal.question(question, onAnswer);
  });
}
|
||||
|
||||
/**
 * Print a fixed-width table of every configured diocese, ordered by country
 * and then by name, followed by a total. Prints a short notice instead when
 * no diocese exists.
 */
async function listDioceses() {
  const dioceses = await prisma.diocese.findMany({
    orderBy: [{ country: 'asc' }, { name: 'asc' }],
  });

  if (dioceses.length === 0) {
    log('No dioceses configured yet.');
    return;
  }

  const rule = '─'.repeat(100);
  const header = [
    'ID'.padEnd(38),
    'Name'.padEnd(30),
    'Country'.padEnd(10),
    'Active'.padEnd(8),
    'Churches'.padEnd(10),
    'Last Scraped',
  ].join('');

  console.log('\nConfigured Dioceses:');
  console.log(rule);
  console.log(header);
  console.log(rule);

  for (const d of dioceses) {
    // Date part only (YYYY-MM-DD) of the last scrape, or a placeholder.
    const lastScraped = d.lastScrapedAt ? d.lastScrapedAt.toISOString().split('T')[0] : 'Never';
    const row = [
      d.id.padEnd(38),
      // Names are cut to 28 characters so they stay inside the 30-wide column.
      d.name.substring(0, 28).padEnd(30),
      d.country.padEnd(10),
      (d.active ? 'Yes' : 'No').padEnd(8),
      String(d.churchCount).padEnd(10),
      lastScraped,
    ].join('');
    console.log(row);
  }

  console.log(rule);
  console.log(`Total: ${dioceses.length} dioceses`);
}
|
||||
|
||||
/**
 * Dry-runs the directory scraper for one stored diocese and prints a preview
 * of the parishes it discovers. Nothing is written to the database.
 *
 * Stops with an error message when the diocese does not exist, has no
 * directory URL, or has no selectors in its scrape config.
 *
 * @param dioceseId Primary key of the diocese to test.
 */
async function testDiocese(dioceseId: string) {
  const record = await prisma.diocese.findUnique({ where: { id: dioceseId } });
  if (!record) {
    logError(`Diocese not found: ${dioceseId}`);
    return;
  }

  const directoryUrl = record.directoryUrl;
  if (!directoryUrl) {
    logError(`Diocese ${record.name} has no directory URL`);
    return;
  }

  const config = record.scrapeConfig as DioceseScrapeConfig | null;
  if (!config?.selectors) {
    logError(`Diocese ${record.name} has no scrape config`);
    return;
  }

  log(`Testing diocese: ${record.name}`);
  log(`Directory URL: ${directoryUrl}`);
  log('');

  const previewLimit = 10;
  const scraper = new DioceseDirectoryScraper();
  try {
    const parishes = await scraper.scrapeDirectory(directoryUrl, config);
    log(`\nDiscovered ${parishes.length} parishes:\n`);

    // Show only the first few entries in full; the rest is summarised below.
    parishes.slice(0, previewLimit).forEach((parish) => {
      console.log(` ${parish.name}`);
      console.log(` URL: ${parish.url}`);
      if (parish.address) console.log(` Address: ${parish.address}`);
      if (parish.city) console.log(` City: ${parish.city}`);
      console.log('');
    });

    const hidden = parishes.length - previewLimit;
    if (hidden > 0) {
      console.log(` ... and ${hidden} more`);
    }
  } finally {
    // Always release the scraper, even when scraping throws.
    await scraper.close();
  }
}
|
||||
|
||||
/**
 * Interactively registers a new diocese for directory scraping.
 *
 * Flow: validate the URL, ask for the diocese name, refuse duplicates
 * (same name + country), open the directory page and print link/element
 * statistics, prompt for CSS selectors, preview the first five matches,
 * and save the diocese after explicit confirmation.
 *
 * Invalid input (bad URL, empty name, empty required selector, malformed
 * regex) is reported through `logError` and ends the function without
 * writing to the database.
 *
 * @param url      Parish directory URL of the diocese.
 * @param country  Country code stored on the diocese record.
 * @param language Language code stored on the diocese record.
 */
async function setupDiocese(url: string, country: string, language: string) {
  // Reject an unusable URL before prompting or launching a browser.
  let website: string;
  try {
    website = new URL(url).origin;
  } catch {
    logError(`Invalid directory URL: ${url}`);
    return;
  }

  log(`Setting up diocese from: ${url}`);
  log(`Country: ${country}, Language: ${language}`);

  // Ask for diocese name
  const name = await ask('\nDiocese name (e.g. "Bistum Mainz"): ');
  if (!name) {
    logError('Name is required');
    return;
  }

  // Check if already exists
  const existing = await prisma.diocese.findFirst({
    where: { name, country },
  });
  if (existing) {
    logError(`Diocese "${name}" already exists in ${country} (ID: ${existing.id})`);
    return;
  }

  // Probe the page structure
  log('\nProbing page structure...');
  const scraper = new DioceseDirectoryScraper();
  await scraper.init();

  try {
    // NOTE(review): reads the scraper's `page` member through an `any` cast;
    // presumably the browser page created by init() — confirm against the
    // scraper class.
    const page = (scraper as any).page;
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

    // Analyze page - count links and common patterns.
    // The callback is handed to page.evaluate, so it only uses what it
    // declares itself and DOM globals.
    const analysis = await page.evaluate(() => {
      const links = Array.from(document.querySelectorAll('a'));
      const linkPatterns: Record<string, number> = {};

      for (const link of links) {
        const href = link.href;
        if (!href) continue;
        // Group links by parent path, e.g. /pfarreien/st-anna -> /pfarreien/*
        try {
          const segments = new URL(href).pathname.split('/').filter(Boolean);
          if (segments.length >= 1) {
            const parent = segments.slice(0, -1).join('/');
            // Single-segment paths have no parent; report them as "/*"
            // rather than the malformed "//*".
            const pattern = parent ? `/${parent}/*` : '/*';
            linkPatterns[pattern] = (linkPatterns[pattern] || 0) + 1;
          }
        } catch { /* ignore */ }
      }

      // Find most common list-like elements
      const listSelectors = [
        'ul li', 'ol li', 'div.parish', 'div.item', 'article',
        'tr', '.card', '.entry', '.listing', '.result',
      ];

      const selectorCounts: Record<string, number> = {};
      for (const sel of listSelectors) {
        selectorCounts[sel] = document.querySelectorAll(sel).length;
      }

      return {
        title: document.title,
        totalLinks: links.length,
        linkPatterns: Object.entries(linkPatterns)
          .sort(([, a], [, b]) => b - a)
          .slice(0, 10),
        selectorCounts,
        bodyTextLength: document.body?.textContent?.length || 0,
      };
    });

    console.log(`\nPage: ${analysis.title}`);
    console.log(`Total links: ${analysis.totalLinks}`);
    console.log(`\nMost common link patterns:`);
    for (const [pattern, count] of analysis.linkPatterns) {
      console.log(` ${pattern}: ${count} links`);
    }
    console.log(`\nElement counts:`);
    for (const [sel, count] of Object.entries(analysis.selectorCounts)) {
      if ((count as number) > 0) console.log(` ${sel}: ${count}`);
    }

    // Ask for selectors
    console.log('\nNow configure CSS selectors for this diocese.\n');

    // The two selectors below are mandatory: an empty one would fail in
    // $$eval and would be useless if saved.
    const parishList = await ask('Parish list container selector (e.g. "ul.parishes li", ".parish-item"): ');
    if (!parishList) {
      logError('Parish list container selector is required');
      return;
    }
    const parishLink = await ask('Parish link selector within container (e.g. "a", "a.parish-link"): ');
    if (!parishLink) {
      logError('Parish link selector is required');
      return;
    }

    // Optional answers: an empty reply is stored as undefined.
    const parishName = await ask('Parish name selector (leave empty to use link text): ') || undefined;
    const parishAddress = await ask('Address selector (leave empty if none): ') || undefined;
    const parishCity = await ask('City selector (leave empty if none): ') || undefined;
    const pagination = await ask('Pagination "next" selector (leave empty if none): ') || undefined;
    const urlPatternStr = await ask('URL pattern regex (leave empty for all): ') || undefined;
    if (urlPatternStr) {
      // Catch a malformed pattern now instead of storing it in the config.
      try {
        new RegExp(urlPatternStr);
      } catch {
        logError(`Invalid URL pattern regex: ${urlPatternStr}`);
        return;
      }
    }
    const waitForSelector = await ask('Wait for selector (leave empty if not needed): ') || undefined;

    const scrapeConfig: DioceseScrapeConfig = {
      selectors: {
        parishList,
        parishLink,
        parishName,
        parishAddress,
        parishCity,
        pagination,
      },
      urlPattern: urlPatternStr,
      waitForSelector,
      maxPages: 50,
      scheduleInDirectory: false,
    };

    // Test the config against the page that is already open.
    console.log('\nTesting selectors...');
    const testResults = await page.$$eval(
      parishList,
      (elements: Element[], linkSel: string) => {
        return elements.slice(0, 5).map(el => {
          const link = el.querySelector(linkSel);
          return {
            name: link?.textContent?.trim() || el.textContent?.trim()?.substring(0, 80) || '(empty)',
            url: link?.getAttribute('href') || '(no link)',
          };
        });
      },
      parishLink
    );

    console.log(`\nTest extraction (first 5):`);
    for (const r of testResults) {
      console.log(` ${r.name}`);
      console.log(` -> ${r.url}`);
    }

    const confirm = await ask('\nSave this configuration? (yes/no): ');
    if (confirm.toLowerCase() !== 'yes' && confirm.toLowerCase() !== 'y') {
      log('Cancelled.');
      return;
    }

    // Save to database
    const diocese = await prisma.diocese.create({
      data: {
        name,
        country,
        language,
        website,
        directoryUrl: url,
        scrapeConfig: scrapeConfig as any,
        active: true,
      },
    });

    log(`\nDiocese saved! ID: ${diocese.id}`);
    log(`Run: npx tsx scripts/scrape-diocese-directory.ts --diocese ${diocese.id} --dry-run`);
  } finally {
    // Release the scraper on every exit path once init() has succeeded.
    await scraper.close();
  }
}
|
||||
|
||||
/**
 * Returns the value that follows `flag` on the command line.
 *
 * @returns `undefined` when the flag is absent, is the last argument, or is
 *          directly followed by another `--flag`.
 */
function readFlagValue(args: string[], flag: string): string | undefined {
  const idx = args.indexOf(flag);
  if (idx === -1) return undefined;
  const value = args[idx + 1];
  return value !== undefined && !value.startsWith('--') ? value : undefined;
}

/** Prints the command-line help text. */
function printUsage(): void {
  console.log('Usage:');
  console.log(' npx tsx scripts/setup-diocese.ts --url <directory-url> --country <CC> --language <lang>');
  console.log(' npx tsx scripts/setup-diocese.ts --list');
  console.log(' npx tsx scripts/setup-diocese.ts --test <diocese-id>');
  console.log('');
  console.log('Examples:');
  console.log(' npx tsx scripts/setup-diocese.ts --url https://bistum-mainz.de/pfarreien --country DE --language de');
  console.log(' npx tsx scripts/setup-diocese.ts --url https://diocese-paris.fr/paroisses --country FR --language fr');
}

/**
 * CLI entry point: dispatches to `--list`, `--test <id>` or the interactive
 * `--url … --country …` setup, then releases the database connections.
 *
 * A flag that is present but has no value is reported and sets a non-zero
 * exit code instead of passing `undefined` to the command.
 */
async function main() {
  const args = process.argv.slice(2);

  try {
    if (args.includes('--list')) {
      await listDioceses();
      return;
    }

    if (args.includes('--test')) {
      const dioceseId = readFlagValue(args, '--test');
      if (!dioceseId) {
        logError('--test requires a diocese ID');
        process.exitCode = 1;
        return;
      }
      await testDiocese(dioceseId);
      return;
    }

    if (!args.includes('--url') || !args.includes('--country')) {
      printUsage();
      return;
    }

    const url = readFlagValue(args, '--url');
    const country = readFlagValue(args, '--country');
    if (!url || !country) {
      logError('--url and --country each require a value');
      printUsage();
      process.exitCode = 1;
      return;
    }

    // Without --language the lower-cased country code is used as the language.
    const language = readFlagValue(args, '--language') ?? country.toLowerCase();

    await setupDiocese(url, country, language);
  } finally {
    // Runs on every path, including when a command throws.
    await prisma.$disconnect();
    await pool.end();
  }
}
|
||||
|
||||
// Report any failure that escapes main() and exit non-zero. The rejection
// value is not guaranteed to be an Error, so it is narrowed before use.
main().catch((error: unknown) => {
  const reason = error instanceof Error ? error.message : String(error);
  logError(`Fatal error: ${reason}`);
  process.exit(1);
});
|
||||
397
scripts/test-edge-cases.ts
Normal file
397
scripts/test-edge-cases.ts
Normal file
@@ -0,0 +1,397 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Comprehensive edge case test suite for the international mass scraper
|
||||
*
|
||||
* This test suite validates all edge cases discovered and fixed during development:
|
||||
* 1. Day range expansion (Monday-Friday, wtorek-sobota, etc.)
|
||||
* 2. Office hours filtering (öffnungszeiten, horario, kancelaria, etc.)
|
||||
* 3. Short abbreviation word boundaries (pn, cz, n in Polish)
|
||||
* 4. Invalid time filtering (00:00-04:59)
|
||||
* 5. Deduplication (same schedule appearing multiple times)
|
||||
* 6. Context-based scoring (mass schedule vs office hours)
|
||||
* 7. "Closed" notice filtering (nieczynna, fermé, cerrado, etc.)
|
||||
*/
|
||||
|
||||
import { GenericScraper } from '../src/scrapers/strategies/generic';
|
||||
|
||||
/** Assertions applied to the schedules scraped for one site. All are optional. */
interface EdgeCaseExpectations {
  /** Lower bound on the number of schedules found. */
  minSchedules?: number;
  /** Upper bound on the number of schedules found. */
  maxSchedules?: number;
  /** Days that must appear at least once (0=Sun, 1=Mon, etc.). */
  shouldHaveDays?: number[];
  /** Invalid times that should be filtered. */
  shouldNotHaveTimes?: string[];
  /** Valid times that should be found. */
  shouldHaveTimes?: string[];
}

/** One live-site scenario of the edge case suite. */
interface EdgeCaseTest {
  /** Display name of the church under test. */
  name: string;
  /** Page handed to the scraper. */
  url: string;
  /** Country code passed to the scraper before scraping. */
  country: string;
  /** Language label shown in the test output. */
  language: string;
  /** Descriptions of the edge cases this site exercises (printed only). */
  edgeCases: string[];
  expectations: EdgeCaseExpectations;
  /** Caveats printed alongside the result. */
  knownIssues?: string[];
}
|
||||
|
||||
/**
 * Live sites used by the suite, one entry per language. Each entry lists the
 * parsing edge cases the site is meant to exercise and the expectations the
 * scraped schedules are checked against.
 */
const edgeCaseTests: EdgeCaseTest[] = [
  // Polish: day ranges, office hours, short abbreviations
  {
    name: 'Parafia Lubojna (PL)',
    url: 'http://parafialubojna.pl',
    country: 'PL',
    language: 'Polish',
    edgeCases: [
      'Day range: "wtorek - sobota" (Tuesday-Saturday)',
      'Office hours: "kancelaria czynna" with times',
      'Short abbreviations: "pn", "cz", "n" in words like "sierpniu", "uroczystości"',
      '"Closed" notice: "nieczynna: niedziela, poniedziałek"',
      'Space-separated times: "8 00", "9 30", "18 00"',
    ],
    expectations: {
      minSchedules: 10,
      maxSchedules: 10,
      // Every day of the week
      shouldHaveDays: [0, 1, 2, 3, 4, 5, 6],
      shouldHaveTimes: ['08:00', '09:30', '11:00', '16:00', '18:00'],
      // Times belonging to the parish office hours
      shouldNotHaveTimes: ['18:30', '19:00', '09:00'],
    },
  },

  // German: office hours, "Uhr" format, duplicates
  {
    name: 'St. Peter, Munich (DE)',
    url: 'https://www.alterpeter.de/',
    country: 'DE',
    language: 'German',
    edgeCases: [
      'Office hours: "öffnungszeiten im pfarrbüro: montag bis donnerstag 9.00 – 12.00"',
      'Day range: "montag bis donnerstag" (Monday to Thursday)',
      'Uhr time format: "10:00 uhr", "17.15 Uhr"',
      'Invalid time: "00 uhr" from fragmented "10:00 uhr"',
      'Duplicates: Same schedule in current week + general schedule',
      'Multi-church parish: Different churches with different times',
    ],
    expectations: {
      minSchedules: 10,
      maxSchedules: 20,
      // Sunday and Saturday as a minimum
      shouldHaveDays: [0, 6],
      // Office-hour times plus the invalid midnight fragment
      shouldNotHaveTimes: ['09:00', '12:00', '14:00', '16:00', '00:00'],
    },
  },

  // Italian: period as the time separator
  {
    name: 'Duomo di Milano (IT)',
    url: 'https://www.duomomilano.it/',
    country: 'IT',
    language: 'Italian',
    edgeCases: [
      'Period separator: "18.30", "9.00"',
      'Day ranges: "da lunedì a venerdì"',
      'Office hours: "orari" or "ufficio"',
    ],
    expectations: {
      minSchedules: 10,
      maxSchedules: 25,
      // All days are expected
      shouldHaveDays: [0, 1, 2, 3, 4, 5, 6],
    },
  },

  // Spanish: day ranges joined with "a"
  {
    name: 'Sagrada Família, Barcelona (ES)',
    url: 'https://sagradafamilia.org/',
    country: 'ES',
    language: 'Spanish',
    edgeCases: [
      'Day ranges: "de lunes a viernes"',
      'Office hours: "horario de oficina"',
    ],
    expectations: {
      minSchedules: 5,
      maxSchedules: 15,
    },
    knownIssues: [
      'Tourist site, may have non-standard schedule format',
      'Some days showing only 1-2 masses',
    ],
  },

  // Czech: minimal schedules
  {
    name: 'Chrám sv. Víta, Prague (CZ)',
    url: 'https://www.katedralasvatehovita.cz/',
    country: 'CZ',
    language: 'Czech',
    edgeCases: [
      'Czech day names and time formats',
      'Limited schedule (cathedral, not parish)',
    ],
    expectations: {
      minSchedules: 1,
      maxSchedules: 10,
    },
  },

  // Hungarian: suffix-based day ranges
  {
    name: 'Szent István Bazilika, Budapest (HU)',
    url: 'https://www.bazilika.biz/',
    country: 'HU',
    language: 'Hungarian',
    edgeCases: [
      'Hungarian day names',
      'Day range suffixes: "-tól", "-től"',
      'Limited weekday schedule',
    ],
    expectations: {
      minSchedules: 3,
      maxSchedules: 10,
      // Monday through Friday
      shouldHaveDays: [1, 2, 3, 4, 5],
    },
  },
];
|
||||
|
||||
/** Outcome of running one edge case scenario. */
interface TestResult {
  /** Name copied from the scenario. */
  name: string;
  /** False as soon as any expectation is violated or the scrape fails. */
  passed: boolean;
  /** Number of schedules the scraper returned. */
  scheduleCount: number;
  /** Description of every failed check. */
  issues: string[];
  /** Description of every check that succeeded. */
  edgeCasesValidated: string[];
}
|
||||
|
||||
/**
 * Scrapes one site and checks the result against the scenario's expectations.
 *
 * Checks, in order: schedule count bounds, required days, forbidden times,
 * required times, duplicate day+time pairs, and times in the 00:00-04:59
 * range. Each failed check adds an entry to `issues`; each successful one
 * adds an entry to `edgeCasesValidated`.
 *
 * Never rejects: a failed scrape or a thrown exception is recorded as an
 * issue on the returned result.
 *
 * @param test    Scenario to run.
 * @param scraper Initialised scraper; its country is set from the scenario.
 * @returns The collected outcome for this scenario.
 */
async function runEdgeCaseTest(test: EdgeCaseTest, scraper: GenericScraper): Promise<TestResult> {
  const result: TestResult = {
    name: test.name,
    passed: true,
    scheduleCount: 0,
    issues: [],
    edgeCasesValidated: [],
  };

  const fail = (issue: string): void => {
    result.passed = false;
    result.issues.push(issue);
  };
  const validated = (note: string): void => {
    result.edgeCasesValidated.push(note);
  };

  try {
    scraper.setCountry(test.country);
    const scrapeResult = await scraper.scrape(test.url);

    if (!scrapeResult.success) {
      fail(`Scrape failed: ${scrapeResult.error}`);
      return result;
    }

    const schedules = scrapeResult.schedules;
    const { minSchedules, maxSchedules, shouldHaveDays, shouldNotHaveTimes, shouldHaveTimes } =
      test.expectations;
    result.scheduleCount = schedules.length;

    // Validate schedule count. Compare against undefined rather than
    // truthiness so that a bound of 0 (e.g. "expect no schedules") is enforced.
    if (minSchedules !== undefined && result.scheduleCount < minSchedules) {
      fail(`Too few schedules: ${result.scheduleCount} < ${minSchedules}`);
    }
    if (maxSchedules !== undefined && result.scheduleCount > maxSchedules) {
      fail(`Too many schedules: ${result.scheduleCount} > ${maxSchedules}`);
    }

    // Validate days covered
    if (shouldHaveDays) {
      const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
      const foundDays = new Set(schedules.map(s => s.dayOfWeek));
      for (const day of shouldHaveDays) {
        if (foundDays.has(day)) {
          validated(`✓ Found ${dayNames[day]}`);
        } else {
          fail(`Missing expected day: ${dayNames[day]}`);
        }
      }
    }

    const foundTimes = new Set(schedules.map(s => s.time));

    // Validate invalid times are NOT present
    if (shouldNotHaveTimes) {
      for (const time of shouldNotHaveTimes) {
        if (foundTimes.has(time)) {
          fail(`Found invalid time that should be filtered: ${time}`);
        } else {
          validated(`✓ Filtered out ${time}`);
        }
      }
    }

    // Validate expected times ARE present
    if (shouldHaveTimes) {
      for (const time of shouldHaveTimes) {
        if (foundTimes.has(time)) {
          validated(`✓ Found ${time}`);
        } else {
          fail(`Missing expected time: ${time}`);
        }
      }
    }

    // Check for duplicates (should be none after deduplication).
    // Two schedules count as duplicates when day and time both match.
    const seenKeys = new Set<string>();
    const duplicates: string[] = [];
    for (const schedule of schedules) {
      const key = `${schedule.dayOfWeek}-${schedule.time}`;
      if (seenKeys.has(key)) {
        duplicates.push(key);
      } else {
        seenKeys.add(key);
      }
    }

    if (duplicates.length > 0) {
      fail(`Found ${duplicates.length} duplicate schedules: ${duplicates.join(', ')}`);
    } else {
      validated('✓ No duplicates');
    }

    // Check for invalid early morning times (00:00-04:59)
    const invalidTimes = schedules.filter(s => {
      const [hours] = s.time.split(':').map(Number);
      return hours >= 0 && hours <= 4;
    });

    if (invalidTimes.length > 0) {
      fail(
        `Found ${invalidTimes.length} invalid early morning times: ${invalidTimes.map(t => t.time).join(', ')}`
      );
    } else {
      validated('✓ No invalid times (00:00-04:59)');
    }
  } catch (error) {
    fail(`Exception: ${error instanceof Error ? error.message : String(error)}`);
  }

  return result;
}
|
||||
|
||||
/**
 * Runs every scenario in `edgeCaseTests` against a single shared scraper,
 * prints a per-test report, a summary table and the static coverage list,
 * then exits with status 1 if any scenario failed and 0 otherwise.
 */
async function main() {
  console.log('🧪 EDGE CASE TEST SUITE FOR INTERNATIONAL MASS SCRAPER');
  console.log('='.repeat(80));
  console.log('');

  const scraper = new GenericScraper();
  await scraper.init();

  const results: TestResult[] = [];
  let passCount = 0;
  let failCount = 0;

  try {
    for (const [index, test] of edgeCaseTests.entries()) {
      console.log(`\n📍 Testing: ${test.name} (${test.language})`);
      console.log(` URL: ${test.url}`);
      console.log(` Edge cases to validate:`);
      for (const edgeCase of test.edgeCases) {
        console.log(` • ${edgeCase}`);
      }

      const result = await runEdgeCaseTest(test, scraper);
      results.push(result);

      if (result.passed) {
        passCount++;
        console.log(`\n ✅ PASSED (${result.scheduleCount} schedules)`);
      } else {
        failCount++;
        console.log(`\n ❌ FAILED (${result.scheduleCount} schedules)`);
      }

      if (result.edgeCasesValidated.length > 0) {
        console.log(`\n Edge cases validated:`);
        for (const validation of result.edgeCasesValidated) {
          console.log(` ${validation}`);
        }
      }

      if (result.issues.length > 0) {
        console.log(`\n ⚠️ Issues:`);
        for (const issue of result.issues) {
          console.log(` • ${issue}`);
        }
      }

      if (test.knownIssues && test.knownIssues.length > 0) {
        console.log(`\n ℹ️ Known issues:`);
        for (const issue of test.knownIssues) {
          console.log(` • ${issue}`);
        }
      }

      // Brief delay between tests; nothing follows the last one, so skip it there.
      if (index < edgeCaseTests.length - 1) {
        await new Promise(resolve => setTimeout(resolve, 2000));
      }
    }
  } finally {
    // Close the scraper even if the loop throws part-way through.
    await scraper.close();
  }

  // Summary
  console.log('\n\n' + '='.repeat(80));
  console.log('📊 TEST SUMMARY');
  console.log('='.repeat(80));
  console.log(`Total tests: ${results.length}`);
  console.log(`✅ Passed: ${passCount}`);
  console.log(`❌ Failed: ${failCount}`);
  console.log(`Success rate: ${((passCount / results.length) * 100).toFixed(1)}%`);

  // Detailed results table
  console.log('\n' + '-'.repeat(80));
  console.log('Test | Status | Schedules | Issues');
  console.log('-'.repeat(80));
  for (const result of results) {
    const status = result.passed ? '✅ PASS' : '❌ FAIL';
    const name = result.name.padEnd(33);
    const schedules = result.scheduleCount.toString().padStart(9);
    const issues = result.issues.length.toString();
    console.log(`${name} | ${status} | ${schedules} | ${issues}`);
  }
  console.log('-'.repeat(80));

  // Edge case coverage summary. This list is static text describing what the
  // suite targets; it is printed regardless of the results above.
  const coverageLines = [
    '\n📋 EDGE CASE COVERAGE:',
    '',
    '1. Day Range Expansion:',
    ' ✓ Polish: "wtorek - sobota"',
    ' ✓ German: "montag bis donnerstag"',
    ' ✓ Italian: "da lunedì a venerdì"',
    ' ✓ Spanish: "de lunes a viernes"',
    '',
    '2. Office Hours Filtering:',
    ' ✓ German: "öffnungszeiten im pfarrbüro"',
    ' ✓ Polish: "kancelaria czynna"',
    ' ✓ Spanish: "horario de oficina"',
    ' ✓ Italian: "orari" / "ufficio"',
    '',
    '3. Short Abbreviation Word Boundaries:',
    ' ✓ Polish: "pn", "cz", "n" (prevented false matches)',
    '',
    '4. Invalid Time Filtering:',
    ' ✓ Filtered: 00:00-04:59 (unrealistic mass times)',
    ' ✓ German "00 uhr" fragments filtered',
    '',
    '5. Deduplication:',
    ' ✓ Same day+time appearing multiple times on page',
    '',
    '6. "Closed" Notice Filtering:',
    ' ✓ Polish: "nieczynna: niedziela, poniedziałek"',
    ' ✓ Multi-language: fermé, cerrado, geschlossen, chiuso',
    '',
    '7. Time Format Support:',
    ' ✓ AM/PM: "8:30 AM", "8 PM"',
    ' ✓ 24-hour: "18:00", "8:30"',
    ' ✓ French/Portuguese: "18h30", "8h"',
    ' ✓ German: "17 Uhr", "17:00 Uhr"',
    ' ✓ Italian: "18.30"',
    ' ✓ Polish: "8 00", "18 00"',
  ];
  for (const line of coverageLines) {
    console.log(line);
  }

  process.exit(failCount > 0 ? 1 : 0);
}
|
||||
|
||||
// A crash outside the per-test error handling (e.g. scraper start-up) must
// fail the suite, so print the error and exit non-zero.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});
|
||||
152
scripts/test-scraper.ts
Normal file
152
scripts/test-scraper.ts
Normal file
@@ -0,0 +1,152 @@
|
||||
import { GenericScraper } from '../src/scrapers/strategies/generic';
|
||||
import { getScraper } from '../src/scrapers/registry';
|
||||
import type { BaseScraper, ScrapeResult } from '../src/scrapers/base-scraper';
|
||||
|
||||
/**
 * Extracts the target URL and the `--country` / `--lang` values from argv.
 *
 * The URL is the first argument that is neither a `--flag` nor the value of
 * `--country` / `--lang`, so flags may come before or after it. A flag with
 * no value (end of argv, or followed by another flag) yields `null`. When a
 * flag is repeated, the first occurrence wins.
 *
 * @param argv Full process argv (the first two entries are skipped).
 */
function parseCliArgs(argv: string[]): { url: string | null; country: string | null; lang: string | null } {
  const parsed: { url: string | null; country: string | null; lang: string | null } = {
    url: null,
    country: null,
    lang: null,
  };
  const args = argv.slice(2);
  let countrySeen = false;
  let langSeen = false;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    if (arg === '--country' || arg === '--lang') {
      const next = args[i + 1];
      const value = next !== undefined && !next.startsWith('--') ? next : null;
      if (value !== null) i++; // the value is consumed, so it is not taken for the URL
      if (arg === '--country' && !countrySeen) {
        parsed.country = value;
        countrySeen = true;
      } else if (arg === '--lang' && !langSeen) {
        parsed.lang = value;
        langSeen = true;
      }
      continue;
    }

    if (!arg.startsWith('--') && parsed.url === null) {
      parsed.url = arg;
    }
  }

  return parsed;
}

const cliArgs = parseCliArgs(process.argv);

// Page to scrape; defaults to a known mass-times page when no URL is given.
const TEST_URL = cliArgs.url || 'https://www.saintpatrickscathedral.org/masses';

// Value of the --country flag, or null when absent
const COUNTRY_CODE = cliArgs.country;

// Value of the --lang flag (e.g., --lang english), or null when absent
const LANG = cliArgs.lang;

// Indexed by day-of-week number, Sunday first.
const DAY_NAMES = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
|
||||
|
||||
/**
 * Reduces an HTML document to plain text for the preview: drops script and
 * style elements, replaces the remaining tags with spaces, turns en/em dashes
 * into hyphens, and collapses whitespace.
 */
function stripHtmlToText(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/[\u2013\u2014]/g, '-')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Scrapes `TEST_URL` once and prints a report: status, church data, the
 * schedules grouped by weekday, a plain-text preview of the page, and a
 * summary. Uses the scraper registered for `LANG` when that flag is set and
 * the generic scraper otherwise.
 *
 * Errors raised while scraping are printed, not rethrown; the scraper is
 * closed in every case.
 */
async function main() {
  const previewLength = 1000;

  console.log('\n' + '='.repeat(70));
  console.log('NEARESTMASS SCRAPER TEST');
  console.log('='.repeat(70));
  console.log(`\nURL: ${TEST_URL}`);
  console.log(`Country: ${COUNTRY_CODE || '(auto-detect from <html lang>)'}`);
  console.log(`Scraper: ${LANG || 'generic'}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('\n' + '-'.repeat(70));

  let scraper: BaseScraper;

  if (LANG) {
    scraper = getScraper(LANG);
    console.log(`\n Using ${LANG} scraper`);
  } else {
    scraper = new GenericScraper();
  }

  try {
    console.log('\n[1/4] Initializing browser...');
    await scraper.init();
    console.log(' ✓ Browser ready');

    // Only the generic scraper is given a country override here.
    if (COUNTRY_CODE && scraper instanceof GenericScraper) {
      scraper.setCountry(COUNTRY_CODE);
      console.log(` Country set to: ${COUNTRY_CODE}`);
    }

    console.log('\n[2/4] Fetching page...');
    const startTime = Date.now();
    const result: ScrapeResult = await scraper.scrape(TEST_URL);
    const elapsed = Date.now() - startTime;
    console.log(` ✓ Page loaded in ${elapsed}ms`);

    console.log('\n[3/4] Parsing results...');
    console.log(` Status: ${result.success ? '✓ SUCCESS' : '✗ FAILED'}`);
    console.log(` Schedules found: ${result.schedules.length}`);

    if (result.detectedLanguage) {
      console.log(` Detected language: ${result.detectedLanguage}`);
    }

    if (result.churchData) {
      console.log('\n Church Data:');
      if (result.churchData.phone) console.log(` Phone: ${result.churchData.phone}`);
      if (result.churchData.email) console.log(` Email: ${result.churchData.email}`);
      if (result.churchData.pastorName) console.log(` Pastor: ${result.churchData.pastorName}`);
      if (result.churchData.diocese) console.log(` Diocese: ${result.churchData.diocese}`);
    }

    if (result.error) {
      console.log(` Error: ${result.error}`);
    }

    if (result.schedules.length > 0) {
      console.log('\n' + '-'.repeat(70));
      console.log('PARSED MASS SCHEDULES');
      console.log('-'.repeat(70));

      // Group by weekday so the output runs Sunday..Saturday.
      const byDay: Record<number, typeof result.schedules> = {};
      for (const schedule of result.schedules) {
        if (!byDay[schedule.dayOfWeek]) {
          byDay[schedule.dayOfWeek] = [];
        }
        byDay[schedule.dayOfWeek].push(schedule);
      }

      for (let day = 0; day < 7; day++) {
        const schedules = byDay[day];
        if (schedules && schedules.length > 0) {
          console.log(`\n${DAY_NAMES[day]}:`);
          for (const s of schedules) {
            // Empty parts are dropped; the language is shown only when it is not English.
            const parts = [
              ` ${s.time}`,
              s.language && s.language !== 'English' ? `(${s.language})` : '',
              s.massType ? `[${s.massType}]` : '',
              s.notes ? `- ${s.notes}` : '',
            ].filter(Boolean);
            console.log(parts.join(' '));
          }
        }
      }
    }

    if (result.rawHtml) {
      console.log('\n' + '-'.repeat(70));
      console.log('RAW TEXT PREVIEW (first 1000 chars, stripped of HTML)');
      console.log('-'.repeat(70));

      const fullText = stripHtmlToText(result.rawHtml);
      console.log('\n' + fullText.substring(0, previewLength));

      // Report truncation only when the stripped text itself was cut; the
      // markup length says nothing about how much text remains.
      if (fullText.length > previewLength) {
        console.log('\n... (truncated)');
      }
    }

    console.log('\n' + '='.repeat(70));
    console.log('SUMMARY');
    console.log('='.repeat(70));
    console.log(`URL: ${TEST_URL}`);
    console.log(`Scraper: ${LANG || 'generic'}`);
    console.log(`Country: ${COUNTRY_CODE || '(auto-detected)'}`);
    console.log(`Language: ${result.detectedLanguage || '(unknown)'}`);
    console.log(`Success: ${result.success ? 'Yes' : 'No'}`);
    console.log(`Schedules: ${result.schedules.length}`);
    console.log(`HTML Size: ${result.rawHtml ? Math.round(result.rawHtml.length / 1024) + ' KB' : 'N/A'}`);

    if (result.schedules.length > 0) {
      const days = [...new Set(result.schedules.map(s => s.dayOfWeek))];
      const languages = [...new Set(result.schedules.map(s => s.language || 'English'))];
      console.log(`Days: ${days.map(d => DAY_NAMES[d]).join(', ')}`);
      console.log(`Languages: ${languages.join(', ')}`);
    }

    console.log('='.repeat(70) + '\n');
  } catch (error) {
    console.error('\n[ERROR]', error);
  } finally {
    console.log('[4/4] Closing browser...');
    await scraper.close();
    console.log(' ✓ Done\n');
  }
}
|
||||
|
||||
// Failures raised before main()'s own try block (e.g. while selecting the
// scraper) end up here: print them and exit non-zero.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});
|
||||
135
scripts/test-url-discovery.ts
Normal file
135
scripts/test-url-discovery.ts
Normal file
@@ -0,0 +1,135 @@
|
||||
import { discoverMassScheduleUrl } from '../src/scrapers/url-discovery';
|
||||
|
||||
/** Homepages used by the batch run when no URL is given. */
const TEST_SITES = [
  'https://www.saintpatrickscathedral.org',
  'https://www.holynamecathedral.org',
  'https://www.olacathedral.org',
];

/** Marker printed next to each confidence level. */
const CONFIDENCE_ICONS: Record<string, string> = {
  'high': '🟢',
  'medium': '🟡',
  'low': '🔴',
};

/** Human-readable explanation of each discovery method. */
const METHOD_DESCRIPTIONS: Record<string, string> = {
  'pattern': 'Found via URL pattern matching',
  'link': 'Found via link crawling',
  'homepage': 'Fell back to homepage',
};
|
||||
|
||||
/**
 * Runs mass-schedule URL discovery for a single site and prints the
 * discovered URL, the method used, the confidence and the elapsed time.
 *
 * @param url Church website to start discovery from.
 */
async function testSingleUrl(url: string) {
  const heavyRule = '='.repeat(70);
  const lightRule = '-'.repeat(70);

  console.log('\n' + heavyRule);
  console.log('NEARESTMASS URL DISCOVERY TEST');
  console.log(heavyRule);
  console.log(`\nURL: ${url}`);
  console.log(`Time: ${new Date().toISOString()}`);
  console.log('\n' + lightRule);

  console.log('\n[1/2] Discovering mass schedule URL...');
  const begunAt = Date.now();
  const discovery = await discoverMassScheduleUrl(url);
  const elapsed = Date.now() - begunAt;
  console.log(` ✓ Discovery completed in ${elapsed}ms`);

  const methodNote = METHOD_DESCRIPTIONS[discovery.method];
  const confidenceIcon = CONFIDENCE_ICONS[discovery.confidence];

  console.log('\n[2/2] Results:');
  console.log(` Discovered URL: ${discovery.url}`);
  console.log(` Method: ${discovery.method} (${methodNote})`);
  console.log(` Confidence: ${confidenceIcon} ${discovery.confidence}`);

  console.log('\n' + heavyRule);
  console.log('SUMMARY');
  console.log(heavyRule);
  console.log(`Input: ${url}`);
  console.log(`Output: ${discovery.url}`);
  console.log(`Method: ${discovery.method}`);
  console.log(`Confidence: ${discovery.confidence}`);
  console.log(`Time: ${elapsed}ms`);
  console.log(heavyRule + '\n');
}
|
||||
|
||||
/**
 * Runs mass-schedule URL discovery against every entry of `TEST_SITES`, one
 * after another with a 2 s pause between sites, and prints a per-site report
 * followed by an aggregate summary.
 *
 * A site whose discovery rejects is logged and recorded as a failure instead
 * of aborting the batch, so the summary is always printed. When no site fails
 * the output is the same as before failures were tracked.
 */
async function testMultipleSites() {
  const heavyRule = '='.repeat(70);
  const lightRule = '-'.repeat(70);

  console.log('\n' + heavyRule);
  console.log('NEARESTMASS URL DISCOVERY TEST (BATCH)');
  console.log(heavyRule);
  console.log(`\nTesting ${TEST_SITES.length} sites...`);
  console.log(`Time: ${new Date().toISOString()}`);

  const results: Array<{
    site: string;
    url: string;
    method: string;
    confidence: string;
    elapsed: number;
  }> = [];
  const failures: Array<{ site: string; message: string; elapsed: number }> = [];

  for (let i = 0; i < TEST_SITES.length; i++) {
    const site = TEST_SITES[i];
    console.log('\n' + lightRule);
    console.log(`[${i + 1}/${TEST_SITES.length}] Testing: ${site}`);
    console.log(lightRule);

    const startTime = Date.now();
    try {
      const result = await discoverMassScheduleUrl(site);
      const elapsed = Date.now() - startTime;

      console.log(`\n  Discovered URL: ${result.url}`);
      console.log(`  Method:         ${result.method} (${METHOD_DESCRIPTIONS[result.method]})`);
      console.log(`  Confidence:     ${CONFIDENCE_ICONS[result.confidence]} ${result.confidence}`);
      console.log(`  Time:           ${elapsed}ms`);

      results.push({
        site,
        url: result.url,
        method: result.method,
        confidence: result.confidence,
        elapsed,
      });
    } catch (error: unknown) {
      // Keep going: one unreachable site must not hide the results of the others.
      const elapsed = Date.now() - startTime;
      const message = error instanceof Error ? error.message : String(error);
      console.log(`\n  ✗ Discovery failed after ${elapsed}ms: ${message}`);
      failures.push({ site, message, elapsed });
    }

    // Rate limiting between sites
    if (i < TEST_SITES.length - 1) {
      console.log('\n  Waiting 2s before next site...');
      await new Promise((r) => setTimeout(r, 2000));
    }
  }

  // Summary table
  console.log('\n' + heavyRule);
  console.log('SUMMARY');
  console.log(heavyRule);

  const countByConfidence = (level: string) =>
    results.filter((r) => r.confidence === level).length;
  const totalTime =
    results.reduce((sum, r) => sum + r.elapsed, 0) +
    failures.reduce((sum, f) => sum + f.elapsed, 0);

  console.log(`\nSites tested: ${results.length + failures.length}`);
  console.log(`High conf:    ${countByConfidence('high')} 🟢`);
  console.log(`Medium conf:  ${countByConfidence('medium')} 🟡`);
  console.log(`Low conf:     ${countByConfidence('low')} 🔴`);
  if (failures.length > 0) {
    console.log(`Failed:       ${failures.length} ❌`);
  }
  console.log(`Total time:   ${totalTime}ms`);

  console.log('\n' + lightRule);
  console.log('RESULTS BY SITE');
  console.log(lightRule);

  for (const r of results) {
    console.log(`\n${r.site}`);
    console.log(`  → ${r.url}`);
    console.log(`  ${CONFIDENCE_ICONS[r.confidence]} ${r.confidence} via ${r.method}`);
  }

  for (const f of failures) {
    console.log(`\n${f.site}`);
    console.log(`  ✗ failed: ${f.message}`);
  }

  console.log('\n' + heavyRule + '\n');
}
|
||||
|
||||
/**
 * CLI entry point: tests the URL given as the first command-line argument,
 * or the whole `TEST_SITES` list when no argument is supplied.
 */
async function main() {
  const testUrl = process.argv[2];

  if (testUrl) {
    await testSingleUrl(testUrl);
  } else {
    await testMultipleSites();
  }
}

main().catch((error: unknown) => {
  console.error(error);
  // Report the failure to the shell; logging alone would leave exit code 0.
  process.exitCode = 1;
});
|
||||
323
scripts/transfer-enriched-to-neon.ts
Normal file
323
scripts/transfer-enriched-to-neon.ts
Normal file
@@ -0,0 +1,323 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
 * Transfer enriched church data from Synology NAS to Neon production
 *
 * This script transfers ONLY churches that have been enriched or scraped
 * (have a website, phone number, Google Place ID, or mass schedules) to
 * reduce data transfer.
 *
 * Usage:
 *   npx tsx scripts/transfer-enriched-to-neon.ts                      # Dry run (incremental)
 *   npx tsx scripts/transfer-enriched-to-neon.ts --execute            # Actually transfer (incremental)
 *   npx tsx scripts/transfer-enriched-to-neon.ts --execute --force-all            # Transfer all enriched churches
 *   npx tsx scripts/transfer-enriched-to-neon.ts --execute --since 2026-02-01T00:00:00Z  # Only churches updated after a date
 */
|
||||
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { Pool } from 'pg';
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
/** Names of the counters accumulated while a transfer run progresses. */
type TransferStatName =
  | 'churchesProcessed'
  | 'churchesInserted'
  | 'churchesUpdated'
  | 'massSchedules'
  | 'confessionSchedules'
  | 'adorationSchedules'
  | 'errors';

/** Running totals printed in the transfer summary; every counter is a plain number. */
type TransferStats = Record<TransferStatName, number>;
|
||||
|
||||
/**
 * Copies enriched churches and their mass/confession/adoration schedules from
 * the NAS database to the Neon production database.
 *
 * CLI flags (read from `process.argv`):
 * - `--execute`       write to Neon; without it the run is a dry run that only counts.
 * - `--since <date>`  only transfer churches whose `updatedAt` is after `<date>`.
 * - `--force-all`     skip the incremental filter and transfer every enriched church.
 *
 * With neither `--since` nor `--force-all`, the newest `lastTransferredAt`
 * found on the NAS is used as the incremental cut-off.
 *
 * Connection settings come from `DATABASE_URL`: first as loaded from `.env`
 * (NAS), then as overridden by `.env.production` (Neon). A real run refuses to
 * start when the second value is missing or identical to the first, because
 * the "Neon" writes (which delete and recreate schedules) would then hit the
 * NAS itself.
 *
 * Each church is written to Neon inside one transaction so a failure cannot
 * leave a church with its schedules deleted but not re-created. Per-church
 * failures are logged and counted without aborting the run; a run that
 * recorded failures finishes with exit code 1. Connection-level failures exit
 * with code 1 immediately.
 *
 * NOTE(review): a church is matched in Neon by exact latitude/longitude
 * equality, and the incremental cut-off relies on how `updatedAt` is
 * maintained on the NAS — both depend on the schema, which is not visible
 * here; confirm before changing either.
 */
async function main() {
  // Parse CLI arguments
  const args = process.argv.slice(2);
  const dryRun = !args.includes('--execute');
  const forceAll = args.includes('--force-all');

  const sinceIndex = args.indexOf('--since');
  const sinceArg = sinceIndex !== -1 ? args[sinceIndex + 1] : undefined;
  const sinceTimestamp = sinceArg ? new Date(sinceArg) : null;

  // Fail fast on `--since` with a missing or unparsable value; an Invalid Date
  // would otherwise surface later as a RangeError from toISOString().
  if (sinceIndex !== -1 && (!sinceTimestamp || Number.isNaN(sinceTimestamp.getTime()))) {
    console.error(
      `❌ --since requires a valid date (e.g. 2026-02-01T00:00:00Z), got: ${sinceArg ?? '(nothing)'}`
    );
    process.exit(1);
  }

  console.log('════════════════════════════════════════════════════════════');
  console.log('  Transfer Enriched Data: Synology NAS → Neon Production');
  console.log('════════════════════════════════════════════════════════════\n');

  if (dryRun) {
    console.log('🔍 DRY RUN MODE - No data will be written to Neon\n');
  } else {
    console.log('⚠️  PRODUCTION MODE - Data will be written to Neon');
    console.log('Press Ctrl+C within 5 seconds to cancel...\n');
    await new Promise(resolve => setTimeout(resolve, 5000));
  }

  if (forceAll) {
    console.log('🔄 FORCE ALL MODE - Transferring all enriched churches\n');
  } else if (sinceTimestamp) {
    console.log(`📅 INCREMENTAL MODE - Only churches modified since ${sinceTimestamp.toISOString()}\n`);
  } else {
    console.log('📅 AUTO INCREMENTAL MODE - Detecting last transfer timestamp...\n');
  }

  // Step 1: Connect to NAS database
  console.log('[1/3] Connecting to Synology NAS database...');
  dotenv.config({ path: path.resolve(process.cwd(), '.env') });

  // Remember the NAS URL: `.env.production` overwrites DATABASE_URL below.
  const nasConnectionString = process.env.DATABASE_URL;
  const nasPool = new Pool({ connectionString: nasConnectionString });
  const nasAdapter = new PrismaPg(nasPool);
  const nasPrisma = new PrismaClient({ adapter: nasAdapter });

  let failed = false;

  try {
    await nasPrisma.$connect();
    const nasHost = nasConnectionString?.split('@')[1]?.split('/')[0] || 'unknown';
    console.log(`✅ Connected to NAS: ${nasHost}\n`);

    // Detect last transfer timestamp if not specified
    let transferSince: Date | null = sinceTimestamp;

    if (!forceAll && !sinceTimestamp) {
      // Auto-detect: find the most recent lastTransferredAt across all churches
      const lastTransfer = await nasPrisma.church.findFirst({
        where: { lastTransferredAt: { not: null } },
        orderBy: { lastTransferredAt: 'desc' },
        select: { lastTransferredAt: true }
      });

      if (lastTransfer?.lastTransferredAt) {
        transferSince = lastTransfer.lastTransferredAt;
        console.log(`✅ Last transfer detected: ${transferSince.toISOString()}`);
        console.log(`   Will transfer churches modified after this time\n`);
      } else {
        console.log('ℹ️  No previous transfer detected - will transfer all enriched churches\n');
      }
    }

    // Step 2: Export enriched churches from NAS
    console.log('[2/3] Exporting enriched churches from NAS...');
    console.log('Criteria: Has website OR phone OR google_place_id OR mass schedules\n');

    // Build WHERE clause
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- Prisma where-input type is not imported in this script
    const whereClause: any = {
      OR: [
        { website: { not: null } },
        { phone: { not: null } },
        { googlePlaceId: { not: null } },
        { massSchedules: { some: {} } },
      ],
      NOT: { latitude: 0, longitude: 0 },
    };

    // Add incremental filter if applicable
    if (!forceAll && transferSince) {
      whereClause.AND = { updatedAt: { gt: transferSince } };
      console.log(`🔄 Incremental filter: updatedAt > ${transferSince.toISOString()}\n`);
    }

    const BATCH_SIZE = 100;
    const totalCount = await nasPrisma.church.count({ where: whereClause });

    console.log(`Found ${totalCount} enriched churches (will process in batches of ${BATCH_SIZE})\n`);

    if (totalCount === 0) {
      console.log('⚠️  No enriched churches to transfer');
      return; // the `finally` below disconnects from the NAS
    }

    // Step 3: Import to Neon
    console.log('[3/3] Importing to Neon production database...');

    // Load Neon credentials
    dotenv.config({ path: path.resolve(process.cwd(), '.env.production'), override: true });

    const neonConnectionString = process.env.DATABASE_URL;
    if (!neonConnectionString || neonConnectionString === nasConnectionString) {
      const problem =
        'DATABASE_URL from .env.production is missing or identical to the NAS DATABASE_URL';
      if (!dryRun) {
        // Writing would delete and recreate schedules in the source database.
        throw new Error(`${problem} - refusing to write`);
      }
      console.warn(`⚠️  ${problem} - the "Neon" connection below is not production\n`);
    }

    const neonPool = new Pool({ connectionString: neonConnectionString });
    const neonAdapter = new PrismaPg(neonPool);
    const neonPrisma = new PrismaClient({ adapter: neonAdapter });

    try {
      await neonPrisma.$connect();
      const neonHost = neonConnectionString?.split('@')[1]?.split('/')[0] || 'unknown';
      console.log(`✅ Connected to Neon: ${neonHost}\n`);

      const stats: TransferStats = {
        churchesProcessed: 0,
        churchesInserted: 0,
        churchesUpdated: 0,
        massSchedules: 0,
        confessionSchedules: 0,
        adorationSchedules: 0,
        errors: 0,
      };

      for (let skip = 0; skip < totalCount; skip += BATCH_SIZE) {
        const churches = await nasPrisma.church.findMany({
          where: whereClause,
          include: {
            massSchedules: true,
            confessionSchedules: true,
            adorationSchedules: true,
          },
          skip,
          take: BATCH_SIZE,
          orderBy: { id: 'asc' },
        });

        console.log(`\nBatch ${Math.floor(skip / BATCH_SIZE) + 1}: processing ${churches.length} churches (${skip + 1}–${skip + churches.length} of ${totalCount})`);

        for (const church of churches) {
          try {
            stats.churchesProcessed++;

            // Separate the relations and the NAS-local bookkeeping columns
            // from the scalar church data that is copied to Neon.
            const {
              massSchedules: massRows,
              confessionSchedules: confessionRows,
              adorationSchedules: adorationRows,
              id: _nasId,
              createdAt: _createdAt,
              updatedAt: _updatedAt,
              lastTransferredAt: _lastTransferredAt,
              ...churchData
            } = church;
            const massSchedules = massRows || [];
            const confessionSchedules = confessionRows || [];
            const adorationSchedules = adorationRows || [];

            if (!dryRun) {
              // One transaction per church: the schedule delete + re-insert
              // either fully applies or not at all.
              const outcome = await neonPrisma.$transaction(
                async (tx) => {
                  // Check if church exists in Neon
                  const existing = await tx.church.findFirst({
                    where: {
                      latitude: church.latitude,
                      longitude: church.longitude,
                    }
                  });

                  let resultId: string;

                  if (existing) {
                    // Update existing church (keep Neon's value when the NAS has none)
                    await tx.church.update({
                      where: { id: existing.id },
                      data: {
                        website: churchData.website || existing.website,
                        phone: churchData.phone || existing.phone,
                        googlePlaceId: churchData.googlePlaceId || existing.googlePlaceId,
                        // Name is always overwritten; address fields only if provided
                        name: churchData.name,
                        address: churchData.address || existing.address,
                        city: churchData.city || existing.city,
                        state: churchData.state || existing.state,
                        zip: churchData.zip || existing.zip,
                        massScheduleUrl: churchData.massScheduleUrl || existing.massScheduleUrl,
                        lastTransferredAt: new Date(), // Mark as transferred
                      }
                    });
                    resultId = existing.id;

                    // Delete old schedules
                    await tx.massSchedule.deleteMany({ where: { churchId: existing.id } });
                    await tx.confessionSchedule.deleteMany({ where: { churchId: existing.id } });
                    await tx.adorationSchedule.deleteMany({ where: { churchId: existing.id } });
                  } else {
                    // Create new church
                    const newChurch = await tx.church.create({
                      data: {
                        ...churchData,
                        lastTransferredAt: new Date(), // Mark as transferred
                      }
                    });
                    resultId = newChurch.id;
                  }

                  // Insert schedules (NAS ids and timestamps are dropped so Neon assigns its own)
                  for (const schedule of massSchedules) {
                    const { id: _id, createdAt: _c, updatedAt: _u, ...scheduleData } = schedule;
                    await tx.massSchedule.create({
                      data: { ...scheduleData, churchId: resultId }
                    });
                  }

                  for (const schedule of confessionSchedules) {
                    const { id: _id, createdAt: _c, updatedAt: _u, ...scheduleData } = schedule;
                    await tx.confessionSchedule.create({
                      data: { ...scheduleData, churchId: resultId }
                    });
                  }

                  for (const schedule of adorationSchedules) {
                    const { id: _id, createdAt: _c, updatedAt: _u, ...scheduleData } = schedule;
                    await tx.adorationSchedule.create({
                      data: { ...scheduleData, churchId: resultId }
                    });
                  }

                  return existing ? 'updated' : 'inserted';
                },
                // Row-by-row inserts over a remote link can exceed the default timeout.
                { maxWait: 10_000, timeout: 60_000 }
              );

              // Count only what was actually committed.
              if (outcome === 'updated') {
                stats.churchesUpdated++;
              } else {
                stats.churchesInserted++;
              }
              stats.massSchedules += massSchedules.length;
              stats.confessionSchedules += confessionSchedules.length;
              stats.adorationSchedules += adorationSchedules.length;

              // Update NAS record with transfer timestamp (after successful transfer to Neon)
              await nasPrisma.church.update({
                where: { id: church.id },
                data: { lastTransferredAt: new Date() }
              });
            } else {
              // Dry run - just count
              stats.massSchedules += massSchedules.length;
              stats.confessionSchedules += confessionSchedules.length;
              stats.adorationSchedules += adorationSchedules.length;
            }

            if (stats.churchesProcessed % 100 === 0) {
              console.log(`Progress: ${stats.churchesProcessed}/${totalCount} churches...`);
            }

          } catch (error) {
            stats.errors++;
            console.error(`Error transferring ${church.name}:`, error instanceof Error ? error.message : error);
          }
        }

        // Brief pause between batches to avoid overwhelming Neon
        await new Promise(resolve => setTimeout(resolve, 1000));
      } // end batch loop

      console.log('\n════════════════════════════════════════════════════════════');
      console.log('Transfer Summary');
      console.log('════════════════════════════════════════════════════════════');
      if (!forceAll && transferSince) {
        console.log(`Transfer mode: Incremental (since ${transferSince.toISOString()})`);
      } else {
        console.log(`Transfer mode: Full (all enriched churches)`);
      }
      console.log(`Churches processed: ${stats.churchesProcessed}`);
      console.log(`Churches inserted: ${stats.churchesInserted}`);
      console.log(`Churches updated: ${stats.churchesUpdated}`);
      console.log(`Mass schedules: ${stats.massSchedules}`);
      console.log(`Confession schedules: ${stats.confessionSchedules}`);
      console.log(`Adoration schedules: ${stats.adorationSchedules}`);
      console.log(`Errors: ${stats.errors}`);
      console.log('════════════════════════════════════════════════════════════\n');

      if (dryRun) {
        console.log('💡 This was a DRY RUN. To actually transfer to Neon, run:');
        console.log('   Incremental sync (default):');
        console.log('     npx tsx scripts/transfer-enriched-to-neon.ts --execute\n');
        console.log('   Transfer all enriched churches:');
        console.log('     npx tsx scripts/transfer-enriched-to-neon.ts --execute --force-all\n');
        console.log('   Transfer since specific date:');
        console.log('     npx tsx scripts/transfer-enriched-to-neon.ts --execute --since 2026-02-01T00:00:00Z\n');
      } else if (stats.errors > 0) {
        // Do not claim success (or exit 0) when some churches were skipped.
        console.log(`⚠️  Transfer finished with ${stats.errors} error(s) - see messages above\n`);
        process.exitCode = 1;
      } else {
        console.log('🎉 Data successfully transferred to Neon production!\n');
      }

    } catch (error) {
      console.error('❌ Neon import failed:', error);
      throw error;
    } finally {
      await neonPrisma.$disconnect();
    }

  } catch (error) {
    console.error('❌ Transfer failed:', error);
    failed = true;
  } finally {
    await nasPrisma.$disconnect();
  }

  if (failed) {
    process.exit(1);
  }
}

main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
123
src/app/api/admin/freesearch-log/route.ts
Normal file
123
src/app/api/admin/freesearch-log/route.ts
Normal file
@@ -0,0 +1,123 @@
|
||||
import { NextRequest, NextResponse } from 'next/server';
|
||||
import { prisma } from '@/lib/db';
|
||||
import { validateAdminApiKey, unauthorizedResponse } from '@/lib/admin-auth';
|
||||
import { Prisma } from '@prisma/client';
|
||||
|
||||
/** Local-language word for "parish", keyed by the church's country code. */
const COUNTRY_KEYWORDS: Record<string, string> = {
  FR: 'paroisse',
  DE: 'pfarrei',
  ES: 'parroquia',
  MX: 'parroquia',
  PL: 'parafia',
  BR: 'paroquia',
  PT: 'paroquia',
  IT: 'parrocchia',
  CZ: 'farnost',
  HU: 'plebania',
  AR: 'parroquia',
  CO: 'parroquia',
  EC: 'parroquia',
  PE: 'parroquia',
  CL: 'parroquia',
  VE: 'parroquia',
  CR: 'parroquia',
  SV: 'parroquia',
  GT: 'parroquia',
  CU: 'parroquia',
  PA: 'parroquia',
  BO: 'parroquia',
  HN: 'parroquia',
  BE: 'paroisse',
  LU: 'paroisse',
  CH: 'pfarrei',
  NL: 'parochie',
  SK: 'farnosť',
  SI: 'župnija',
};

/** Countries for which the church's state is included in the search query. */
const STATES_COUNTRIES = new Set(['US', 'CA', 'AU', 'BR']);

/**
 * Builds the search string shown for a church in the FreeSearch log.
 *
 * The query is, in order and separated by single spaces: the quoted church
 * name, the city (if any), the state (only for countries in
 * `STATES_COUNTRIES`), the country's parish keyword (if one is listed in
 * `COUNTRY_KEYWORDS`), and the literal `official website`.
 *
 * @param church - Name, city, state and country code of the church.
 * @returns The assembled query string.
 */
function buildSearchQuery(church: {
  name: string;
  city: string | null;
  state: string | null;
  country: string;
}): string {
  const parts = [`"${church.name}"`];

  if (church.city) parts.push(church.city);
  if (church.state && STATES_COUNTRIES.has(church.country)) parts.push(church.state);

  // Own-property check: a country value such as "constructor" must not pick up
  // a member inherited from Object.prototype.
  const keyword = Object.prototype.hasOwnProperty.call(COUNTRY_KEYWORDS, church.country)
    ? COUNTRY_KEYWORDS[church.country]
    : undefined;
  if (keyword) parts.push(keyword);

  parts.push('official website');
  return parts.join(' ');
}
|
||||
|
||||
/**
 * Parses an integer query parameter and clamps it to `[min, max]`.
 * Returns `fallback` when the parameter is absent, empty, or not a number.
 */
function parseBoundedInt(raw: string | null, fallback: number, min: number, max: number): number {
  const parsed = Number.parseInt(raw ?? '', 10);
  if (Number.isNaN(parsed)) return fallback;
  return Math.min(Math.max(parsed, min), max);
}

/**
 * GET /api/admin/freesearch-log — List FreeSearch results.
 *
 * Query parameters:
 * - `filter`  `found`, `not-found`, or anything else for all searched churches.
 * - `country` exact country value to match.
 * - `limit`   page size, default 50, clamped to 0–200.
 * - `offset`  rows to skip, default 0, never negative.
 *
 * Responds with `{ results, total, limit, offset }`, 401 via
 * `unauthorizedResponse()` when the admin key check fails, or 500 with an
 * error message when the query throws.
 */
export async function GET(request: NextRequest) {
  if (!validateAdminApiKey(request)) return unauthorizedResponse();

  try {
    const { searchParams } = new URL(request.url);
    const filter = searchParams.get('filter') || 'all';
    const country = searchParams.get('country');
    // Non-numeric or negative values fall back / clamp instead of reaching Prisma.
    const limit = parseBoundedInt(searchParams.get('limit'), 50, 0, 200);
    const offset = parseBoundedInt(searchParams.get('offset'), 0, 0, Number.MAX_SAFE_INTEGER);

    const where: Prisma.ChurchWhereInput = {
      freeSearchedAt: { not: null },
    };

    if (filter === 'found') {
      where.hasWebsite = true;
      where.website = { not: null };
    } else if (filter === 'not-found') {
      where.OR = [{ hasWebsite: false }, { website: null }];
    }

    if (country) {
      where.country = country;
    }

    const [results, total] = await Promise.all([
      prisma.church.findMany({
        where,
        select: {
          id: true,
          name: true,
          city: true,
          state: true,
          country: true,
          freeSearchedAt: true,
          hasWebsite: true,
          website: true,
        },
        orderBy: { freeSearchedAt: 'desc' },
        take: limit,
        skip: offset,
      }),
      prisma.church.count({ where }),
    ]);

    return NextResponse.json({
      results: results.map((c) => ({
        id: c.id,
        name: c.name,
        city: c.city,
        country: c.country,
        searchQuery: buildSearchQuery(c),
        freeSearchedAt: c.freeSearchedAt,
        found: c.hasWebsite && c.website !== null,
        website: c.website,
      })),
      total,
      limit,
      offset,
    });
  } catch (error) {
    console.error('Error fetching freesearch log:', error);
    return NextResponse.json(
      { error: 'Failed to fetch freesearch log' },
      { status: 500 }
    );
  }
}
|
||||
30
src/app/api/admin/jobs/[jobId]/route.ts
Normal file
30
src/app/api/admin/jobs/[jobId]/route.ts
Normal file
@@ -0,0 +1,30 @@
|
||||
import { NextRequest, NextResponse } from 'next/server';
|
||||
import { prisma } from '@/lib/db';
|
||||
import { validateAdminApiKey, unauthorizedResponse } from '@/lib/admin-auth';
|
||||
|
||||
/**
 * GET /api/admin/jobs/[jobId] — Get detailed job status.
 *
 * Responds with `{ job }` for a known id, 404 when no background job has that
 * id, 401 via `unauthorizedResponse()` when the admin key check fails, and
 * 500 when the lookup throws.
 */
export async function GET(
  request: NextRequest,
  { params }: { params: Promise<{ jobId: string }> }
) {
  if (!validateAdminApiKey(request)) {
    return unauthorizedResponse();
  }

  try {
    const { jobId: id } = await params;
    const job = await prisma.backgroundJob.findUnique({ where: { id } });

    return job
      ? NextResponse.json({ job })
      : NextResponse.json({ error: 'Job not found' }, { status: 404 });
  } catch (error) {
    console.error('Error fetching job:', error);
    return NextResponse.json({ error: 'Failed to fetch job' }, { status: 500 });
  }
}
|
||||
155
src/app/api/admin/jobs/route.ts
Normal file
155
src/app/api/admin/jobs/route.ts
Normal file
@@ -0,0 +1,155 @@
|
||||
import { NextRequest, NextResponse } from 'next/server';
|
||||
import { prisma } from '@/lib/db';
|
||||
import { validateAdminApiKey, unauthorizedResponse } from '@/lib/admin-auth';
|
||||
|
||||
/**
 * GET /api/admin/jobs — List the 50 most recent background jobs together with
 * church database statistics (totals, enrichment counts, and a per-language
 * breakdown of churches that have a website language set).
 *
 * Responds 401 via `unauthorizedResponse()` when the admin key check fails
 * and 500 when any query throws.
 */
export async function GET(request: NextRequest) {
  if (!validateAdminApiKey(request)) {
    return unauthorizedResponse();
  }

  try {
    // Most recent jobs first
    const jobs = await prisma.backgroundJob.findMany({
      orderBy: { createdAt: 'desc' },
      take: 50,
    });

    // Church database stats, counted concurrently
    const [
      totalChurches,
      withWebsites,
      scraped,
      withSchedules,
      googlePlacesEnriched,
      freeSearchSearched,
      freeSearchFound,
    ] = await Promise.all([
      prisma.church.count(),
      prisma.church.count({ where: { hasWebsite: true } }),
      prisma.church.count({ where: { lastScrapedAt: { not: null } } }),
      prisma.church.count({ where: { massSchedules: { some: {} } } }),
      prisma.church.count({ where: { googlePlaceId: { not: null } } }),
      prisma.church.count({ where: { freeSearchedAt: { not: null } } }),
      prisma.church.count({
        where: { freeSearchedAt: { not: null }, hasWebsite: true },
      }),
    ]);

    // Language breakdown, largest group first
    const languageGroups = await prisma.church.groupBy({
      by: ['websiteLanguage'],
      _count: { id: true },
      where: { websiteLanguage: { not: null } },
      orderBy: { _count: { id: 'desc' } },
    });

    // Groups with an empty or null language are left out of the map.
    const byLanguage: Record<string, number> = Object.fromEntries(
      languageGroups.flatMap((group) =>
        group.websiteLanguage ? [[group.websiteLanguage, group._count.id] as const] : []
      )
    );

    const enrichment = { googlePlacesEnriched, freeSearchSearched, freeSearchFound };
    const stats = {
      totalChurches,
      withWebsites,
      scraped,
      withSchedules,
      byLanguage,
      enrichment,
    };

    return NextResponse.json({ jobs, stats });
  } catch (error) {
    console.error('Error fetching jobs:', error);
    return NextResponse.json({ error: 'Failed to fetch jobs' }, { status: 500 });
  }
}
|
||||
|
||||
/** Job types accepted by POST /api/admin/jobs. */
const CREATABLE_JOB_TYPES = ['scraper', 'freesearch-enrichment', 'reverse-geocode-enrichment'];

/**
 * POST /api/admin/jobs — Create a new pending job.
 *
 * Expects a JSON object body `{ type, language?, config? }`. `type` must be one
 * of `CREATABLE_JOB_TYPES`; `language`, when given, must be a string.
 *
 * Responds 201 with `{ job }`, 400 for a malformed body or invalid field,
 * 401 via `unauthorizedResponse()` when the admin key check fails, and 500
 * when the insert throws.
 */
export async function POST(request: NextRequest) {
  if (!validateAdminApiKey(request)) return unauthorizedResponse();

  try {
    // A body that is not parseable JSON is a client error, not a server error.
    const body = await request.json().catch(() => null);
    if (body === null || typeof body !== 'object') {
      return NextResponse.json(
        { error: 'Request body must be a JSON object' },
        { status: 400 }
      );
    }

    const { type, language, config } = body;

    if (typeof type !== 'string' || !CREATABLE_JOB_TYPES.includes(type)) {
      return NextResponse.json(
        { error: 'Invalid job type. Must be: scraper, freesearch-enrichment, or reverse-geocode-enrichment' },
        { status: 400 }
      );
    }

    if (language != null && typeof language !== 'string') {
      return NextResponse.json(
        { error: 'language must be a string when provided' },
        { status: 400 }
      );
    }

    const job = await prisma.backgroundJob.create({
      data: {
        type,
        language: language || null,
        status: 'pending',
        config: config || null,
      },
    });

    return NextResponse.json({ job }, { status: 201 });
  } catch (error) {
    console.error('Error creating job:', error);
    return NextResponse.json(
      { error: 'Failed to create job' },
      { status: 500 }
    );
  }
}
|
||||
|
||||
/**
 * PATCH /api/admin/jobs — Stop a running job.
 *
 * Expects a JSON body `{ jobId, action: "stop" }`. The status change from
 * `running` to `stopping` is done as a single conditional update, so a job
 * that finished in the meantime is never overwritten with `stopping`.
 *
 * Responds with `{ job }` on success, 400 for a malformed body or a job that
 * is not running, 404 for an unknown job, 401 via `unauthorizedResponse()`
 * when the admin key check fails, and 500 when a query throws.
 */
export async function PATCH(request: NextRequest) {
  if (!validateAdminApiKey(request)) return unauthorizedResponse();

  try {
    // Malformed JSON is treated like a body without the required fields.
    const body = await request.json().catch(() => null);
    const { jobId, action } = body ?? {};

    if (typeof jobId !== 'string' || !jobId || action !== 'stop') {
      return NextResponse.json(
        { error: 'Must provide jobId and action: "stop"' },
        { status: 400 }
      );
    }

    // Only flips the status if the job is still running at write time.
    const { count } = await prisma.backgroundJob.updateMany({
      where: { id: jobId, status: 'running' },
      data: { status: 'stopping' },
    });

    const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
    if (!job) {
      return NextResponse.json({ error: 'Job not found' }, { status: 404 });
    }

    if (count === 0) {
      return NextResponse.json(
        { error: `Cannot stop job with status: ${job.status}` },
        { status: 400 }
      );
    }

    return NextResponse.json({ job });
  } catch (error) {
    console.error('Error stopping job:', error);
    return NextResponse.json(
      { error: 'Failed to stop job' },
      { status: 500 }
    );
  }
}
|
||||
86
src/app/api/admin/scrape-log/route.ts
Normal file
86
src/app/api/admin/scrape-log/route.ts
Normal file
@@ -0,0 +1,86 @@
|
||||
import { NextRequest, NextResponse } from 'next/server';
|
||||
import { prisma } from '@/lib/db';
|
||||
import { validateAdminApiKey, unauthorizedResponse } from '@/lib/admin-auth';
|
||||
import { Prisma } from '@prisma/client';
|
||||
|
||||
/**
 * Parses an integer query parameter and clamps it to `[min, max]`.
 * Returns `fallback` when the parameter is absent, empty, or not a number.
 */
function parseBoundedInt(raw: string | null, fallback: number, min: number, max: number): number {
  const parsed = Number.parseInt(raw ?? '', 10);
  if (Number.isNaN(parsed)) return fallback;
  return Math.min(Math.max(parsed, min), max);
}

/**
 * GET /api/admin/scrape-log — List recently scraped churches.
 *
 * Query parameters:
 * - `filter`   `success` (has mass schedules), `failed` (has none), or
 *              anything else for every scraped church.
 * - `language` exact `websiteLanguage` value to match.
 * - `limit`    page size, default 50, clamped to 0–200.
 * - `offset`   rows to skip, default 0, never negative.
 *
 * Responds with `{ results, total, limit, offset }`, 401 via
 * `unauthorizedResponse()` when the admin key check fails, or 500 with an
 * error message when the query throws.
 */
export async function GET(request: NextRequest) {
  if (!validateAdminApiKey(request)) return unauthorizedResponse();

  try {
    const { searchParams } = new URL(request.url);
    const filter = searchParams.get('filter') || 'all';
    const language = searchParams.get('language');
    // Non-numeric or negative values fall back / clamp instead of reaching Prisma.
    const limit = parseBoundedInt(searchParams.get('limit'), 50, 0, 200);
    const offset = parseBoundedInt(searchParams.get('offset'), 0, 0, Number.MAX_SAFE_INTEGER);

    const where: Prisma.ChurchWhereInput = {
      lastScrapedAt: { not: null },
    };

    if (filter === 'success') {
      where.massSchedules = { some: {} };
    } else if (filter === 'failed') {
      where.massSchedules = { none: {} };
    }

    if (language) {
      where.websiteLanguage = language;
    }

    const [results, total] = await Promise.all([
      prisma.church.findMany({
        where,
        select: {
          id: true,
          name: true,
          website: true,
          massScheduleUrl: true,
          country: true,
          city: true,
          websiteLanguage: true,
          lastScrapedAt: true,
          scraperConfig: {
            select: {
              strategyName: true,
              failureCount: true,
            },
          },
          _count: {
            select: { massSchedules: true },
          },
        },
        orderBy: { lastScrapedAt: 'desc' },
        take: limit,
        skip: offset,
      }),
      prisma.church.count({ where }),
    ]);

    return NextResponse.json({
      results: results.map((c) => ({
        id: c.id,
        name: c.name,
        website: c.website,
        massScheduleUrl: c.massScheduleUrl,
        country: c.country,
        city: c.city,
        websiteLanguage: c.websiteLanguage,
        lastScrapedAt: c.lastScrapedAt,
        // Churches without a scraper config are reported with the defaults below.
        strategy: c.scraperConfig?.strategyName || 'generic',
        failureCount: c.scraperConfig?.failureCount || 0,
        scheduleCount: c._count.massSchedules,
        success: c._count.massSchedules > 0,
      })),
      total,
      limit,
      offset,
    });
  } catch (error) {
    console.error('Error fetching scrape log:', error);
    return NextResponse.json(
      { error: 'Failed to fetch scrape log' },
      { status: 500 }
    );
  }
}
|
||||
113
src/app/api/admin/scraper-health/route.ts
Normal file
113
src/app/api/admin/scraper-health/route.ts
Normal file
@@ -0,0 +1,113 @@
|
||||
import { NextRequest, NextResponse } from 'next/server';
|
||||
import { prisma } from '@/lib/db';
|
||||
import { validateAdminApiKey, unauthorizedResponse } from '@/lib/admin-auth';
|
||||
import { buildLanguageFilter } from '@/lib/scraper-service';
|
||||
|
||||
/** Language keys for which a scrape-queue count is computed. */
const LANGUAGES = [
  'english',
  'french',
  'spanish',
  'italian',
  'german',
  'polish',
  'portuguese',
  'dutch',
  'czech',
  'hungarian',
  'generic',
];
|
||||
|
||||
/**
 * Formats a millisecond duration as `"<h>h <m>m"`, or `"<m>m"` when it is
 * shorter than one hour. Seconds are truncated.
 *
 * Negative or non-finite input (for example a start time that lies ahead of
 * the local clock) is treated as zero instead of producing `"-1m"` or `"NaNm"`.
 *
 * @param ms - Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms: number): string {
  const safeMs = Number.isFinite(ms) && ms > 0 ? ms : 0;
  const hours = Math.floor(safeMs / 3_600_000);
  const minutes = Math.floor((safeMs % 3_600_000) / 60_000);
  return hours > 0 ? `${hours}h ${minutes}m` : `${minutes}m`;
}
|
||||
|
||||
/**
 * GET /api/admin/scraper-health — Quick health check for the scraper pipeline.
 *
 * Reports how many churches were scraped in the last 1, 6 and 24 hours, the
 * scraper jobs currently running, and the size of the scrape queue per
 * language (languages with an empty queue are omitted). The pipeline is
 * flagged unhealthy when a scraper job has been running for more than six
 * hours while nothing was scraped during the last six hours.
 *
 * Responds 401 via `unauthorizedResponse()` when the admin key check fails
 * and 500 when any query throws.
 */
export async function GET(request: NextRequest) {
  if (!validateAdminApiKey(request)) {
    return unauthorizedResponse();
  }

  try {
    const now = Date.now();
    const hoursAgo = (hours: number) => new Date(now - hours * 3_600_000);
    const thirtyDaysAgo = new Date(now - 30 * 24 * 60 * 60 * 1000);

    // Throughput: churches scraped within the last 1h / 6h / 24h
    const throughputPromise = Promise.all([
      prisma.church.count({ where: { lastScrapedAt: { gte: hoursAgo(1) } } }),
      prisma.church.count({ where: { lastScrapedAt: { gte: hoursAgo(6) } } }),
      prisma.church.count({ where: { lastScrapedAt: { gte: hoursAgo(24) } } }),
    ]);

    // Scraper jobs currently marked as running
    const runningJobsPromise = prisma.backgroundJob.findMany({
      where: { status: 'running', type: 'scraper' },
      select: { id: true, type: true, language: true, startedAt: true, processed: true },
    });

    // Queue membership: unclaimed, has a website, never scraped or last scraped
    // more than 30 days ago, and fewer than 5 recorded failures (or no config).
    const baseWhere = {
      claimed: false,
      website: { not: null },
      OR: [
        { lastScrapedAt: null },
        { lastScrapedAt: { lt: thirtyDaysAgo } },
      ],
      AND: [
        {
          OR: [
            { scraperConfig: null },
            { scraperConfig: { failureCount: { lt: 5 } } },
          ],
        },
      ],
    };

    // One count per language, with the language filter appended to the base AND list
    const queuePromises = LANGUAGES.map(async (lang) => {
      const languageFilter = buildLanguageFilter(lang);
      const extraConditions = languageFilter ? [languageFilter] : [];
      const count = await prisma.church.count({
        where: {
          ...baseWhere,
          AND: [...(baseWhere.AND as object[]), ...extraConditions],
        },
      });
      return [lang, count] as const;
    });

    // All queries above were started already; wait for them together
    const [[last1h, last6h, last24h], runningJobs, queueResults] = await Promise.all([
      throughputPromise,
      runningJobsPromise,
      Promise.all(queuePromises),
    ]);

    const queue: Record<string, number> = Object.fromEntries(
      queueResults.filter(([, count]) => count > 0)
    );

    // Running jobs with a human-readable elapsed time
    const jobs = runningJobs.map(({ id, type, language, startedAt, processed }) => ({
      id,
      type,
      language,
      startedAt,
      runningFor: startedAt ? formatDuration(now - startedAt.getTime()) : null,
      processed,
    }));

    // Unhealthy when a scraper has run for >6h and nothing was scraped in the last 6h
    const sixHoursMs = 6 * 3_600_000;
    const hasStuckJob = runningJobs.some(
      (job) => job.startedAt && now - job.startedAt.getTime() > sixHoursMs
    );
    const healthy = !hasStuckJob || last6h !== 0;
    const warning = healthy
      ? null
      : 'Scraper job running >6h with zero throughput in last 6 hours';

    return NextResponse.json({
      throughput: { last1h, last6h, last24h },
      runningJobs: jobs,
      queue,
      healthy,
      warning,
    });
  } catch (error) {
    console.error('Error in scraper health:', error);
    return NextResponse.json({ error: 'Failed to get scraper health' }, { status: 500 });
  }
}
|
||||
57
src/app/globals.css
Normal file
57
src/app/globals.css
Normal file
@@ -0,0 +1,57 @@
|
||||
@import "tailwindcss";

/* Default (light) theme. */
:root {
  /* Raw brand palette; the semantic tokens below refer to these. */
  --sacred-gold: #D4AF37;
  --soft-burgundy: #8B3A62;
  --deep-purple: #4A2545;
  --cream: #FAF8F3;
  --warm-white: #FFFBF5;

  /* Page surface and default text. */
  --background: var(--warm-white);
  --foreground: var(--deep-purple);

  /* Brand roles. */
  --color-primary: var(--deep-purple);
  --color-primary-light: var(--soft-burgundy);
  --color-accent: var(--sacred-gold);

  /* Status colours (not overridden by the dark theme). */
  --color-success: #16a34a;
  --color-warning: #ca8a04;
  --color-error: #dc2626;

  /* Cards. */
  --color-card: #ffffff;
  --color-card-border: rgba(212, 175, 55, 0.2);

  /* Secondary text. */
  --color-text-secondary: var(--soft-burgundy);
  --color-text-muted: #8B6A7A;

  /* Form inputs. */
  --color-input-bg: #ffffff;
  --color-input-border: rgba(74, 37, 69, 0.2);
}

/* Dark theme from the OS preference, unless <html data-theme="light"> is set. */
@media (prefers-color-scheme: dark) {
  :root:not([data-theme="light"]) {
    --background: #1a0f19;
    --foreground: var(--cream);

    --color-primary: var(--sacred-gold);
    --color-primary-light: #c9a030;

    --color-card: #261a25;
    --color-card-border: rgba(212, 175, 55, 0.15);

    --color-text-secondary: #d4c0cb;
    --color-text-muted: #a08a96;

    --color-input-bg: #2d1f2c;
    --color-input-border: rgba(212, 175, 55, 0.2);
  }
}

/* Dark theme forced via <html data-theme="dark">; same values as the media-query block above. */
:root[data-theme="dark"] {
  --background: #1a0f19;
  --foreground: var(--cream);

  --color-primary: var(--sacred-gold);
  --color-primary-light: #c9a030;

  --color-card: #261a25;
  --color-card-border: rgba(212, 175, 55, 0.15);

  --color-text-secondary: #d4c0cb;
  --color-text-muted: #a08a96;

  --color-input-bg: #2d1f2c;
  --color-input-border: rgba(212, 175, 55, 0.2);
}

/* Map the page surface/text variables onto Tailwind theme colour tokens. */
@theme inline {
  --color-background: var(--background);
  --color-foreground: var(--foreground);
}
|
||||
37
src/app/layout.tsx
Normal file
37
src/app/layout.tsx
Normal file
@@ -0,0 +1,37 @@
|
||||
import type { Metadata } from 'next';
|
||||
import './globals.css';
|
||||
|
||||
/** Page metadata for the app: sets the title and a `noindex, nofollow` robots directive. */
export const metadata: Metadata = {
  robots: 'noindex, nofollow',
  title: 'ScraperControl',
};
|
||||
|
||||
/**
 * Root layout: renders the <html>/<body> shell around every page.
 *
 * An inline script in <head> reads `localStorage.theme` and, when it is
 * 'dark' or 'light', copies it onto `<html data-theme>`. Any error while doing
 * so is swallowed by the script itself.
 */
export default function RootLayout(props: {
  children: React.ReactNode;
}) {
  const { children } = props;

  // Kept as a plain string because it is injected verbatim via dangerouslySetInnerHTML.
  const themeBootstrap = `
    (function() {
      try {
        var theme = localStorage.getItem('theme');
        if (theme === 'dark' || theme === 'light') {
          document.documentElement.setAttribute('data-theme', theme);
        }
      } catch(e) {}
    })();
  `;

  return (
    // NOTE(review): suppressHydrationWarning is presumably here because the script
    // above may add data-theme before React hydrates; confirm.
    <html lang="en" suppressHydrationWarning>
      <head>
        <script dangerouslySetInnerHTML={{ __html: themeBootstrap }} />
      </head>
      <body className="bg-[var(--background)] text-[var(--foreground)] antialiased">
        {children}
      </body>
    </html>
  );
}
|
||||
870
src/app/page.tsx
Normal file
870
src/app/page.tsx
Normal file
@@ -0,0 +1,870 @@
|
||||
'use client';
|
||||
|
||||
import { useState, useEffect, useCallback } from 'react';
|
||||
|
||||
/** One job record as returned in `jobs` by `GET /api/admin/jobs`. */
interface BackgroundJob {
  /** Job identifier; used as the React key and sent as `jobId` when stopping. */
  id: string;
  /** Job type key; looked up in JOB_TYPE_LABELS for display. */
  type: string;
  /** Shown after the type label when set. */
  language: string | null;
  /** The dashboard checks for 'pending', 'running', 'stopping', 'completed' and 'failed'. */
  status: string;
  /** Denominator of the progress bar. */
  totalItems: number;
  /** Numerator of the progress bar. */
  processed: number;
  /** NOTE(review): presumably the count of successfully processed items; confirm against the API. */
  succeeded: number;
  /** Displayed as "N failed" when greater than zero. */
  failed: number;
  /** Displayed as "N found". */
  itemsFound: number;
  /** NOTE(review): presumably the failure message, if any; confirm against the API. */
  error: string | null;
  /** NOTE(review): presumably the config object the job was started with; confirm against the API. */
  config: Record<string, unknown> | null;
  /** Timestamp string parsed with `new Date(...)` for the "started … ago" label; null when absent. */
  startedAt: string | null;
  /** NOTE(review): presumably a timestamp string like `startedAt`; confirm. */
  completedAt: string | null;
  /** NOTE(review): presumably a timestamp string like `startedAt`; confirm. */
  createdAt: string;
}
|
||||
|
||||
/** Aggregate counts returned in `stats` by `GET /api/admin/jobs`, shown in the "Church Database" card. */
interface ChurchStats {
  /** Shown as "Total". */
  totalChurches: number;
  /** Shown as "Websites". */
  withWebsites: number;
  /** Shown as "Scraped". */
  scraped: number;
  /** Shown as "With Schedules". */
  withSchedules: number;
  /** Church count per website-language key; keys are looked up in LANG_LABELS for display. */
  byLanguage: Record<string, number>;
  /** Enrichment pipeline counters. */
  enrichment: {
    /** Shown as "Google Enriched". */
    googlePlacesEnriched: number;
    /** NOTE(review): presumably churches FreeSearch has searched for; not displayed in the visible UI. */
    freeSearchSearched: number;
    /** Shown as "FreeSearch Found". */
    freeSearchFound: number;
  };
}
|
||||
|
||||
/** One row of `results` from `GET /api/admin/scrape-log`, rendered as a card on the Scrapes tab. */
interface ScrapeLogEntry {
  /** Used as the React key. */
  id: string;
  /** Card title. */
  name: string;
  /** Rendered as a link when set. */
  website: string | null;
  /** Rendered as an extra link when set and different from `website`. */
  massScheduleUrl: string | null;
  country: string;
  /** Appended after the country when set. */
  city: string | null;
  /** Shown in the footer; falls back to `strategy` when empty. */
  websiteLanguage: string | null;
  /** Timestamp string parsed with `new Date(...)` for the relative-time label. */
  lastScrapedAt: string;
  /** Shown in the footer when `websiteLanguage` is empty. */
  strategy: string;
  /** Shown as "(N attempts)" on failed rows when greater than 1. */
  failureCount: number;
  /** Shown as "N schedule(s)" on successful rows. */
  scheduleCount: number;
  /** Selects the success or failure presentation of the card. */
  success: boolean;
}
|
||||
|
||||
/** One row of `results` from `GET /api/admin/freesearch-log`, rendered as a card on the Search tab. */
interface FreeSearchLogEntry {
  /** Used as the React key. */
  id: string;
  /** Card title. */
  name: string;
  /** Appended after the country when set. */
  city: string | null;
  country: string;
  /** Shown in monospace under the title. */
  searchQuery: string;
  /** Timestamp string parsed with `new Date(...)` for the relative-time label. */
  freeSearchedAt: string;
  /** When true and `website` is set, the website link is shown; otherwise "No website found". */
  found: boolean;
  website: string | null;
}
|
||||
|
||||
/** Which dashboard tab is selected. */
type Tab =
  | 'jobs'
  | 'scrapeLog'
  | 'freeSearchLog';

/** Value sent as the `filter` query parameter to the scrape-log endpoint. */
type ScrapeLogFilter =
  | 'all'
  | 'success'
  | 'failed';

/** Value sent as the `filter` query parameter to the freesearch-log endpoint. */
type FreeSearchLogFilter =
  | 'all'
  | 'found'
  | 'not-found';
|
||||
|
||||
/**
 * Reads the admin API key saved under 'admin-api-key' in sessionStorage.
 *
 * @returns The stored key, or '' when there is no `window` (e.g. during
 *   server rendering), when nothing is stored, or when storage access throws.
 */
function getStoredKey() {
  if (typeof window === 'undefined') return '';
  try {
    return sessionStorage.getItem('admin-api-key') ?? '';
  } catch {
    // Storage access can throw when the browser blocks it; treat that as
    // "no key" so the caller falls back to the sign-in form instead of crashing.
    return '';
  }
}
|
||||
|
||||
/**
 * Formats a timestamp string for display using the en-US locale, with a
 * short month, numeric day and year, and two-digit hour and minute.
 *
 * No time zone is passed, so the runtime's default time zone applies.
 */
function formatDate(iso: string) {
  const displayOptions: Intl.DateTimeFormatOptions = {
    year: 'numeric',
    month: 'short',
    day: 'numeric',
    hour: '2-digit',
    minute: '2-digit',
  };
  const parsed = new Date(iso);
  return parsed.toLocaleDateString('en-US', displayOptions);
}
|
||||
|
||||
/**
 * Describes how long ago a timestamp was, relative to `Date.now()`.
 *
 * @param iso Timestamp string accepted by `new Date(...)`.
 * @returns 'just now' (under one minute, including future timestamps),
 *   '<n>m ago', '<n>h ago' or '<n>d ago'; 'unknown' when `iso` cannot be parsed.
 */
function formatRelativeTime(iso: string): string {
  const elapsedMs = Date.now() - new Date(iso).getTime();
  // An unparsable timestamp yields NaN, which fails every comparison below and
  // would otherwise be rendered as "NaNd ago".
  if (Number.isNaN(elapsedMs)) return 'unknown';

  const minutes = Math.floor(elapsedMs / 60000);
  if (minutes < 1) return 'just now';
  if (minutes < 60) return `${minutes}m ago`;

  const hours = Math.floor(minutes / 60);
  if (hours < 24) return `${hours}h ago`;

  const days = Math.floor(hours / 24);
  return `${days}d ago`;
}
|
||||
|
||||
/** Formats a number for display using the runtime's default locale and default number options. */
function formatNumber(n: number): string {
  const defaultFormatter = new Intl.NumberFormat();
  return defaultFormatter.format(n);
}
|
||||
|
||||
/** Display names for two-letter language codes; codes missing here are shown as-is by the dashboard. */
const LANG_LABELS: Record<string, string> = {
  en: 'English',
  fr: 'French',
  de: 'German',
  es: 'Spanish',
  pt: 'Portuguese',
  pl: 'Polish',
  cs: 'Czech',
  nl: 'Dutch',
  hu: 'Hungarian',
  it: 'Italian',
  hr: 'Croatian',
  sk: 'Slovak',
  tl: 'Filipino',
  vi: 'Vietnamese',
};
|
||||
|
||||
/** Display names for job `type` values; types missing here are shown as-is by the dashboard. */
const JOB_TYPE_LABELS: Record<string, string> = {
  scraper: 'Scraper',
  'google-enrichment': 'Google Places',
  'freesearch-enrichment': 'FreeSearch',
  'reverse-geocode-enrichment': 'Reverse Geocode',
};
|
||||
|
||||
export default function ScraperControlPage() {
|
||||
const [apiKey, setApiKey] = useState('');
|
||||
const [authenticated, setAuthenticated] = useState(false);
|
||||
const [authError, setAuthError] = useState('');
|
||||
const [authLoading, setAuthLoading] = useState(false);
|
||||
const [initState, setInitState] = useState<'idle' | 'checking' | 'done'>('idle');
|
||||
|
||||
const [tab, setTab] = useState<Tab>('jobs');
|
||||
|
||||
// Jobs state
|
||||
const [jobs, setJobs] = useState<BackgroundJob[]>([]);
|
||||
const [stats, setStats] = useState<ChurchStats | null>(null);
|
||||
const [jobsLoading, setJobsLoading] = useState(false);
|
||||
|
||||
// Scrape Log state
|
||||
const [scrapeLog, setScrapeLog] = useState<ScrapeLogEntry[]>([]);
|
||||
const [scrapeLogTotal, setScrapeLogTotal] = useState(0);
|
||||
const [scrapeLogFilter, setScrapeLogFilter] = useState<ScrapeLogFilter>('all');
|
||||
const [scrapeLogLoading, setScrapeLogLoading] = useState(false);
|
||||
const [scrapeLogOffset, setScrapeLogOffset] = useState(0);
|
||||
|
||||
// FreeSearch Log state
|
||||
const [freeSearchLog, setFreeSearchLog] = useState<FreeSearchLogEntry[]>([]);
|
||||
const [freeSearchLogTotal, setFreeSearchLogTotal] = useState(0);
|
||||
const [freeSearchLogFilter, setFreeSearchLogFilter] = useState<FreeSearchLogFilter>('all');
|
||||
const [freeSearchLogLoading, setFreeSearchLogLoading] = useState(false);
|
||||
const [freeSearchLogOffset, setFreeSearchLogOffset] = useState(0);
|
||||
|
||||
// New job form
|
||||
const [newJobType, setNewJobType] = useState<string>('scraper');
|
||||
const [newJobLanguage, setNewJobLanguage] = useState<string>('english');
|
||||
const [newJobCountry, setNewJobCountry] = useState<string>('');
|
||||
const [newJobLimit, setNewJobLimit] = useState<string>('500');
|
||||
const [newJobReSearch, setNewJobReSearch] = useState(false);
|
||||
const [newJobContinuous, setNewJobContinuous] = useState(false);
|
||||
const [startingJob, setStartingJob] = useState(false);
|
||||
|
||||
function getHeaders() {
|
||||
return { 'x-api-key': getStoredKey(), 'Content-Type': 'application/json' };
|
||||
}
|
||||
|
||||
// Check session on mount
|
||||
useEffect(() => {
|
||||
const key = getStoredKey();
|
||||
if (!key) {
|
||||
// eslint-disable-next-line react-hooks/set-state-in-effect
|
||||
setInitState('done');
|
||||
return;
|
||||
}
|
||||
fetch('/api/admin/jobs?limit=1', {
|
||||
headers: { 'x-api-key': key },
|
||||
}).then((res) => {
|
||||
if (res.status === 401) {
|
||||
sessionStorage.removeItem('admin-api-key');
|
||||
} else {
|
||||
setAuthenticated(true);
|
||||
}
|
||||
setInitState('done');
|
||||
}).catch(() => {
|
||||
setInitState('done');
|
||||
});
|
||||
}, []);
|
||||
|
||||
// Load jobs + stats when on jobs tab, auto-refresh every 10s
|
||||
const loadJobs = useCallback(async () => {
|
||||
if (!authenticated) return;
|
||||
setJobsLoading(true);
|
||||
try {
|
||||
const res = await fetch('/api/admin/jobs', { headers: getHeaders() });
|
||||
if (res.ok) {
|
||||
const data = await res.json();
|
||||
setJobs(data.jobs);
|
||||
setStats(data.stats);
|
||||
}
|
||||
} catch { /* ignore */ }
|
||||
setJobsLoading(false);
|
||||
}, [authenticated]);
|
||||
|
||||
useEffect(() => {
|
||||
if (!authenticated || tab !== 'jobs') return;
|
||||
loadJobs();
|
||||
const interval = setInterval(loadJobs, 10000);
|
||||
return () => clearInterval(interval);
|
||||
}, [authenticated, tab, loadJobs]);
|
||||
|
||||
// Load scrape log when on scrapeLog tab
|
||||
useEffect(() => {
|
||||
if (!authenticated || tab !== 'scrapeLog') return;
|
||||
let cancelled = false;
|
||||
|
||||
async function load() {
|
||||
setScrapeLogLoading(true);
|
||||
try {
|
||||
const params = new URLSearchParams({
|
||||
filter: scrapeLogFilter,
|
||||
limit: '50',
|
||||
offset: String(scrapeLogOffset),
|
||||
});
|
||||
const res = await fetch(`/api/admin/scrape-log?${params}`, { headers: getHeaders() });
|
||||
if (res.ok && !cancelled) {
|
||||
const data = await res.json();
|
||||
setScrapeLog(data.results);
|
||||
setScrapeLogTotal(data.total);
|
||||
}
|
||||
} catch { /* ignore */ }
|
||||
if (!cancelled) setScrapeLogLoading(false);
|
||||
}
|
||||
|
||||
load();
|
||||
return () => { cancelled = true; };
|
||||
}, [authenticated, tab, scrapeLogFilter, scrapeLogOffset]);
|
||||
|
||||
// Load freesearch log when on freeSearchLog tab
|
||||
useEffect(() => {
|
||||
if (!authenticated || tab !== 'freeSearchLog') return;
|
||||
let cancelled = false;
|
||||
|
||||
async function load() {
|
||||
setFreeSearchLogLoading(true);
|
||||
try {
|
||||
const params = new URLSearchParams({
|
||||
filter: freeSearchLogFilter,
|
||||
limit: '50',
|
||||
offset: String(freeSearchLogOffset),
|
||||
});
|
||||
const res = await fetch(`/api/admin/freesearch-log?${params}`, { headers: getHeaders() });
|
||||
if (res.ok && !cancelled) {
|
||||
const data = await res.json();
|
||||
setFreeSearchLog(data.results);
|
||||
setFreeSearchLogTotal(data.total);
|
||||
}
|
||||
} catch { /* ignore */ }
|
||||
if (!cancelled) setFreeSearchLogLoading(false);
|
||||
}
|
||||
|
||||
load();
|
||||
return () => { cancelled = true; };
|
||||
}, [authenticated, tab, freeSearchLogFilter, freeSearchLogOffset]);
|
||||
|
||||
async function validateKey(key: string) {
|
||||
setAuthLoading(true);
|
||||
setAuthError('');
|
||||
try {
|
||||
const res = await fetch('/api/admin/jobs?limit=1', {
|
||||
headers: { 'x-api-key': key },
|
||||
});
|
||||
if (res.status === 401) {
|
||||
setAuthError('Invalid API key');
|
||||
sessionStorage.removeItem('admin-api-key');
|
||||
} else {
|
||||
sessionStorage.setItem('admin-api-key', key);
|
||||
setApiKey(key);
|
||||
setAuthenticated(true);
|
||||
}
|
||||
} catch {
|
||||
setAuthError('Connection error');
|
||||
}
|
||||
setAuthLoading(false);
|
||||
}
|
||||
|
||||
function logout() {
|
||||
sessionStorage.removeItem('admin-api-key');
|
||||
setAuthenticated(false);
|
||||
setApiKey('');
|
||||
setJobs([]);
|
||||
setStats(null);
|
||||
}
|
||||
|
||||
async function startJob() {
|
||||
setStartingJob(true);
|
||||
try {
|
||||
const config: Record<string, unknown> = {};
|
||||
if (newJobLimit) config.limit = parseInt(newJobLimit);
|
||||
if (newJobType === 'scraper') config.language = newJobLanguage;
|
||||
if (newJobCountry && newJobType !== 'scraper') config.country = newJobCountry;
|
||||
if (newJobContinuous && (newJobType === 'freesearch-enrichment' || newJobType === 'reverse-geocode-enrichment')) config.continuous = true;
|
||||
if (newJobType === 'freesearch-enrichment' && newJobReSearch) config.reSearch = true;
|
||||
|
||||
await fetch('/api/admin/jobs', {
|
||||
method: 'POST',
|
||||
headers: getHeaders(),
|
||||
body: JSON.stringify({
|
||||
type: newJobType,
|
||||
language: newJobType === 'scraper' ? newJobLanguage : null,
|
||||
config,
|
||||
}),
|
||||
});
|
||||
await loadJobs();
|
||||
} catch { /* ignore */ }
|
||||
setStartingJob(false);
|
||||
}
|
||||
|
||||
async function stopJob(jobId: string) {
|
||||
if (!confirm('Stop this job?')) return;
|
||||
await fetch('/api/admin/jobs', {
|
||||
method: 'PATCH',
|
||||
headers: getHeaders(),
|
||||
body: JSON.stringify({ jobId, action: 'stop' }),
|
||||
});
|
||||
await loadJobs();
|
||||
}
|
||||
|
||||
// Initializing screen
|
||||
if (initState !== 'done') {
|
||||
return (
|
||||
<div className="min-h-screen flex items-center justify-center bg-[var(--background)]">
|
||||
<p className="text-[var(--color-text-muted)]">Loading...</p>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
// Auth screen
|
||||
if (!authenticated) {
|
||||
return (
|
||||
<div className="min-h-screen flex items-center justify-center bg-[var(--background)] p-4">
|
||||
<div className="w-full max-w-sm bg-[var(--color-card)] border border-[var(--color-card-border)] rounded-xl p-6 shadow-lg">
|
||||
<h1 className="text-xl font-semibold text-[var(--color-primary)] mb-4 text-center">
|
||||
ScraperControl
|
||||
</h1>
|
||||
<form
|
||||
onSubmit={(e) => {
|
||||
e.preventDefault();
|
||||
validateKey(apiKey);
|
||||
}}
|
||||
>
|
||||
<input
|
||||
type="password"
|
||||
value={apiKey}
|
||||
onChange={(e) => setApiKey(e.target.value)}
|
||||
placeholder="Enter API key"
|
||||
className="w-full px-3 py-2 rounded-lg border border-[var(--color-input-border)] bg-[var(--color-input-bg)] text-[var(--foreground)] mb-3 focus:outline-none focus:ring-2 focus:ring-[var(--color-accent)]"
|
||||
autoFocus
|
||||
/>
|
||||
{authError && <p className="text-[var(--color-error)] text-sm mb-3">{authError}</p>}
|
||||
<button
|
||||
type="submit"
|
||||
disabled={authLoading || !apiKey}
|
||||
className="w-full py-2 rounded-lg bg-[var(--color-primary)] text-white font-medium disabled:opacity-50 hover:opacity-90 transition-opacity"
|
||||
>
|
||||
{authLoading ? 'Checking...' : 'Sign In'}
|
||||
</button>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
// Helpers for jobs tab
|
||||
const activeJobs = jobs.filter(j => j.status === 'pending' || j.status === 'running' || j.status === 'stopping');
|
||||
const completedJobs = jobs.filter(j => j.status === 'completed' || j.status === 'failed');
|
||||
|
||||
// Dashboard
|
||||
return (
|
||||
<div className="min-h-screen bg-[var(--background)] text-[var(--foreground)] overflow-auto">
|
||||
{/* Header */}
|
||||
<header className="sticky top-0 z-10 bg-[var(--color-card)] border-b border-[var(--color-card-border)] px-4 py-3 flex items-center justify-between shadow-sm">
|
||||
<h1 className="text-lg font-semibold text-[var(--color-primary)]">
|
||||
ScraperControl
|
||||
</h1>
|
||||
<button
|
||||
onClick={logout}
|
||||
className="text-sm px-3 py-1 rounded-lg border border-[var(--color-input-border)] text-[var(--color-text-muted)] hover:text-[var(--color-error)] hover:border-[var(--color-error)] transition-colors"
|
||||
>
|
||||
Logout
|
||||
</button>
|
||||
</header>
|
||||
|
||||
{/* Tabs */}
|
||||
<div className="flex border-b border-[var(--color-card-border)] bg-[var(--color-card)] overflow-x-auto">
|
||||
{([
|
||||
{ key: 'jobs' as Tab, label: 'Jobs', badge: activeJobs.length > 0 ? activeJobs.length : null, badgeColor: 'bg-[var(--color-success)]' },
|
||||
{ key: 'scrapeLog' as Tab, label: 'Scrapes', badge: null, badgeColor: '' },
|
||||
{ key: 'freeSearchLog' as Tab, label: 'Search', badge: null, badgeColor: '' },
|
||||
]).map((t) => (
|
||||
<button
|
||||
key={t.key}
|
||||
onClick={() => setTab(t.key)}
|
||||
className={`flex-1 py-3 text-sm font-medium text-center transition-colors relative whitespace-nowrap px-2 ${
|
||||
tab === t.key
|
||||
? 'text-[var(--color-primary)] border-b-2 border-[var(--color-primary)]'
|
||||
: 'text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)]'
|
||||
}`}
|
||||
>
|
||||
{t.label}
|
||||
{t.badge !== null && (
|
||||
<span className={`ml-1 inline-flex items-center justify-center w-5 h-5 text-xs rounded-full text-white ${t.badgeColor}`}>
|
||||
{t.badge}
|
||||
</span>
|
||||
)}
|
||||
</button>
|
||||
))}
|
||||
</div>
|
||||
|
||||
<div className="max-w-3xl mx-auto p-4">
|
||||
{/* Scrape Log Tab */}
|
||||
{tab === 'scrapeLog' && (
|
||||
<>
|
||||
<div className="flex gap-2 mb-4">
|
||||
{(['all', 'success', 'failed'] as ScrapeLogFilter[]).map((f) => (
|
||||
<button
|
||||
key={f}
|
||||
onClick={() => { setScrapeLogFilter(f); setScrapeLogOffset(0); }}
|
||||
className={`px-3 py-1.5 text-sm rounded-lg border transition-colors capitalize ${
|
||||
scrapeLogFilter === f
|
||||
? 'bg-[var(--color-primary)] text-white border-[var(--color-primary)]'
|
||||
: 'border-[var(--color-input-border)] text-[var(--color-text-muted)] hover:border-[var(--color-primary)]'
|
||||
}`}
|
||||
>
|
||||
{f}
|
||||
</button>
|
||||
))}
|
||||
<span className="ml-auto text-sm text-[var(--color-text-muted)] self-center">
|
||||
{formatNumber(scrapeLogTotal)} total
|
||||
</span>
|
||||
</div>
|
||||
|
||||
{scrapeLogLoading ? (
|
||||
<p className="text-center text-[var(--color-text-muted)] py-8">Loading...</p>
|
||||
) : scrapeLog.length === 0 ? (
|
||||
<p className="text-center text-[var(--color-text-muted)] py-8">No scrape results found</p>
|
||||
) : (
|
||||
<div className="space-y-3">
|
||||
{scrapeLog.map((entry) => (
|
||||
<div
|
||||
key={entry.id}
|
||||
className={`bg-[var(--color-card)] border rounded-xl p-4 ${
|
||||
entry.success
|
||||
? 'border-[var(--color-success)]/20'
|
||||
: 'border-[var(--color-error)]/20'
|
||||
}`}
|
||||
>
|
||||
<div className="flex items-start justify-between gap-2 mb-1">
|
||||
<span className="font-medium text-sm text-[var(--foreground)] truncate">
|
||||
{entry.name}
|
||||
</span>
|
||||
<div className="flex items-center gap-2 shrink-0">
|
||||
<span className="text-xs text-[var(--color-text-muted)]">
|
||||
{entry.country}{entry.city ? ` \u00B7 ${entry.city}` : ''}
|
||||
</span>
|
||||
<span className="text-xs text-[var(--color-text-muted)]">
|
||||
{formatRelativeTime(entry.lastScrapedAt)}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
{entry.massScheduleUrl && entry.massScheduleUrl !== entry.website && (
|
||||
<a
|
||||
href={entry.massScheduleUrl}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="text-xs text-[var(--color-accent)] hover:underline truncate block"
|
||||
>
|
||||
{entry.massScheduleUrl}
|
||||
</a>
|
||||
)}
|
||||
{entry.website && (
|
||||
<a
|
||||
href={entry.website}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="text-xs text-[var(--color-accent)] hover:underline truncate block"
|
||||
>
|
||||
{entry.website}
|
||||
</a>
|
||||
)}
|
||||
<div className="flex items-center gap-2 mt-1.5 text-xs">
|
||||
<span className="text-[var(--color-text-muted)]">
|
||||
{entry.websiteLanguage || entry.strategy}
|
||||
</span>
|
||||
{entry.success ? (
|
||||
<span className="text-[var(--color-success)] font-medium">
|
||||
{entry.scheduleCount} schedule{entry.scheduleCount !== 1 ? 's' : ''}
|
||||
</span>
|
||||
) : (
|
||||
<span className="text-[var(--color-error)] font-medium">
|
||||
Failed{entry.failureCount > 1 ? ` (${entry.failureCount} attempts)` : ''}
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Pagination */}
|
||||
{scrapeLogTotal > 50 && (
|
||||
<div className="flex items-center justify-between mt-4 text-sm text-[var(--color-text-muted)]">
|
||||
<span>
|
||||
{scrapeLogOffset + 1}-{Math.min(scrapeLogOffset + 50, scrapeLogTotal)} of {formatNumber(scrapeLogTotal)}
|
||||
</span>
|
||||
<div className="flex gap-2">
|
||||
<button
|
||||
onClick={() => setScrapeLogOffset(Math.max(0, scrapeLogOffset - 50))}
|
||||
disabled={scrapeLogOffset === 0}
|
||||
className="px-3 py-1 rounded-lg border border-[var(--color-input-border)] disabled:opacity-30 hover:border-[var(--color-primary)] transition-colors"
|
||||
>
|
||||
Prev
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setScrapeLogOffset(scrapeLogOffset + 50)}
|
||||
disabled={scrapeLogOffset + 50 >= scrapeLogTotal}
|
||||
className="px-3 py-1 rounded-lg border border-[var(--color-input-border)] disabled:opacity-30 hover:border-[var(--color-primary)] transition-colors"
|
||||
>
|
||||
Next
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
|
||||
{/* FreeSearch Log Tab */}
|
||||
{tab === 'freeSearchLog' && (
|
||||
<>
|
||||
<div className="flex gap-2 mb-4">
|
||||
{(['all', 'found', 'not-found'] as FreeSearchLogFilter[]).map((f) => (
|
||||
<button
|
||||
key={f}
|
||||
onClick={() => { setFreeSearchLogFilter(f); setFreeSearchLogOffset(0); }}
|
||||
className={`px-3 py-1.5 text-sm rounded-lg border transition-colors capitalize ${
|
||||
freeSearchLogFilter === f
|
||||
? 'bg-[var(--color-primary)] text-white border-[var(--color-primary)]'
|
||||
: 'border-[var(--color-input-border)] text-[var(--color-text-muted)] hover:border-[var(--color-primary)]'
|
||||
}`}
|
||||
>
|
||||
{f === 'not-found' ? 'Not Found' : f}
|
||||
</button>
|
||||
))}
|
||||
<span className="ml-auto text-sm text-[var(--color-text-muted)] self-center">
|
||||
{formatNumber(freeSearchLogTotal)} total
|
||||
</span>
|
||||
</div>
|
||||
|
||||
{freeSearchLogLoading ? (
|
||||
<p className="text-center text-[var(--color-text-muted)] py-8">Loading...</p>
|
||||
) : freeSearchLog.length === 0 ? (
|
||||
<p className="text-center text-[var(--color-text-muted)] py-8">No search results found</p>
|
||||
) : (
|
||||
<div className="space-y-3">
|
||||
{freeSearchLog.map((entry) => (
|
||||
<div
|
||||
key={entry.id}
|
||||
className={`bg-[var(--color-card)] border rounded-xl p-4 ${
|
||||
entry.found
|
||||
? 'border-[var(--color-success)]/20'
|
||||
: 'border-[var(--color-card-border)]'
|
||||
}`}
|
||||
>
|
||||
<div className="flex items-start justify-between gap-2 mb-1">
|
||||
<span className="font-medium text-sm text-[var(--foreground)] truncate">
|
||||
{entry.name}
|
||||
</span>
|
||||
<div className="flex items-center gap-2 shrink-0">
|
||||
<span className="text-xs text-[var(--color-text-muted)]">
|
||||
{entry.country}{entry.city ? ` \u00B7 ${entry.city}` : ''}
|
||||
</span>
|
||||
<span className="text-xs text-[var(--color-text-muted)]">
|
||||
{formatRelativeTime(entry.freeSearchedAt)}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
<p className="text-xs text-[var(--color-text-muted)] font-mono truncate mb-1">
|
||||
{entry.searchQuery}
|
||||
</p>
|
||||
{entry.found && entry.website ? (
|
||||
<a
|
||||
href={entry.website}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="text-xs text-[var(--color-success)] hover:underline truncate block font-medium"
|
||||
>
|
||||
{entry.website}
|
||||
</a>
|
||||
) : (
|
||||
<span className="text-xs text-[var(--color-text-muted)] italic">
|
||||
No website found
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Pagination */}
|
||||
{freeSearchLogTotal > 50 && (
|
||||
<div className="flex items-center justify-between mt-4 text-sm text-[var(--color-text-muted)]">
|
||||
<span>
|
||||
{freeSearchLogOffset + 1}-{Math.min(freeSearchLogOffset + 50, freeSearchLogTotal)} of {formatNumber(freeSearchLogTotal)}
|
||||
</span>
|
||||
<div className="flex gap-2">
|
||||
<button
|
||||
onClick={() => setFreeSearchLogOffset(Math.max(0, freeSearchLogOffset - 50))}
|
||||
disabled={freeSearchLogOffset === 0}
|
||||
className="px-3 py-1 rounded-lg border border-[var(--color-input-border)] disabled:opacity-30 hover:border-[var(--color-primary)] transition-colors"
|
||||
>
|
||||
Prev
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setFreeSearchLogOffset(freeSearchLogOffset + 50)}
|
||||
disabled={freeSearchLogOffset + 50 >= freeSearchLogTotal}
|
||||
className="px-3 py-1 rounded-lg border border-[var(--color-input-border)] disabled:opacity-30 hover:border-[var(--color-primary)] transition-colors"
|
||||
>
|
||||
Next
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
|
||||
{/* Jobs Tab */}
|
||||
{tab === 'jobs' && (
|
||||
<>
|
||||
{jobsLoading && !stats ? (
|
||||
<p className="text-center text-[var(--color-text-muted)] py-8">Loading...</p>
|
||||
) : (
|
||||
<>
|
||||
{/* Church Database Stats */}
|
||||
{stats && (
|
||||
<div className="bg-[var(--color-card)] border border-[var(--color-card-border)] rounded-xl p-4 mb-4">
|
||||
<h3 className="text-sm font-semibold text-[var(--foreground)] mb-3">Church Database</h3>
|
||||
<div className="grid grid-cols-2 gap-3 text-sm">
|
||||
<div>
|
||||
<span className="text-[var(--color-text-muted)]">Total</span>
|
||||
<span className="float-right font-medium">{formatNumber(stats.totalChurches)}</span>
|
||||
</div>
|
||||
<div>
|
||||
<span className="text-[var(--color-text-muted)]">Websites</span>
|
||||
<span className="float-right font-medium">{formatNumber(stats.withWebsites)}</span>
|
||||
</div>
|
||||
<div>
|
||||
<span className="text-[var(--color-text-muted)]">Scraped</span>
|
||||
<span className="float-right font-medium">{formatNumber(stats.scraped)}</span>
|
||||
</div>
|
||||
<div>
|
||||
<span className="text-[var(--color-text-muted)]">With Schedules</span>
|
||||
<span className="float-right font-medium">{formatNumber(stats.withSchedules)}</span>
|
||||
</div>
|
||||
<div>
|
||||
<span className="text-[var(--color-text-muted)]">Google Enriched</span>
|
||||
<span className="float-right font-medium">{formatNumber(stats.enrichment.googlePlacesEnriched)}</span>
|
||||
</div>
|
||||
<div>
|
||||
<span className="text-[var(--color-text-muted)]">FreeSearch Found</span>
|
||||
<span className="float-right font-medium">{formatNumber(stats.enrichment.freeSearchFound)}</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Language Breakdown */}
|
||||
{stats && Object.keys(stats.byLanguage).length > 0 && (
|
||||
<div className="bg-[var(--color-card)] border border-[var(--color-card-border)] rounded-xl p-4 mb-4">
|
||||
<h3 className="text-sm font-semibold text-[var(--foreground)] mb-3">Churches by Website Language</h3>
|
||||
<div className="space-y-2">
|
||||
{Object.entries(stats.byLanguage)
|
||||
.sort(([, a], [, b]) => b - a)
|
||||
.slice(0, 10)
|
||||
.map(([lang, count]) => {
|
||||
const total = Object.values(stats.byLanguage).reduce((a, b) => a + b, 0);
|
||||
const pct = total > 0 ? (count / total) * 100 : 0;
|
||||
return (
|
||||
<div key={lang} className="flex items-center gap-2 text-sm">
|
||||
<span className="w-16 text-[var(--color-text-muted)]">{LANG_LABELS[lang] || lang}</span>
|
||||
<div className="flex-1 h-4 bg-[var(--color-input-bg)] rounded-full overflow-hidden">
|
||||
<div
|
||||
className="h-full bg-[var(--color-primary)] rounded-full"
|
||||
style={{ width: `${Math.max(pct, 1)}%` }}
|
||||
/>
|
||||
</div>
|
||||
<span className="w-16 text-right text-[var(--color-text-muted)]">{formatNumber(count)}</span>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Active Jobs */}
|
||||
{activeJobs.length > 0 && (
|
||||
<div className="mb-4">
|
||||
<h3 className="text-sm font-semibold text-[var(--foreground)] mb-2">Active Jobs</h3>
|
||||
<div className="space-y-2">
|
||||
{activeJobs.map((job) => {
|
||||
const pct = job.totalItems > 0 ? (job.processed / job.totalItems) * 100 : 0;
|
||||
return (
|
||||
<div key={job.id} className="bg-[var(--color-card)] border border-[var(--color-accent)]/30 rounded-xl p-4">
|
||||
<div className="flex items-center justify-between mb-2">
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="font-medium text-sm">
|
||||
{JOB_TYPE_LABELS[job.type] || job.type}
|
||||
{job.language ? `: ${job.language}` : ''}
|
||||
</span>
|
||||
<span className={`text-[10px] px-1.5 py-0.5 rounded-full text-white font-medium ${
|
||||
job.status === 'pending' ? 'bg-[var(--color-text-muted)]' :
|
||||
job.status === 'stopping' ? 'bg-[var(--color-warning)]' : 'bg-[var(--color-success)]'
|
||||
}`}>
|
||||
{job.status === 'pending' ? 'PENDING' : job.status === 'stopping' ? 'STOPPING' : 'RUNNING'}
|
||||
</span>
|
||||
</div>
|
||||
{job.status === 'running' && (
|
||||
<button
|
||||
onClick={() => stopJob(job.id)}
|
||||
className="text-xs px-2.5 py-1 rounded-lg border border-[var(--color-error)]/50 text-[var(--color-error)] hover:bg-[var(--color-error)]/10 transition-colors"
|
||||
>
|
||||
Stop
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
{/* Progress bar */}
|
||||
<div className="h-2 bg-[var(--color-input-bg)] rounded-full overflow-hidden mb-2">
|
||||
<div
|
||||
className="h-full bg-[var(--color-accent)] rounded-full transition-all"
|
||||
style={{ width: `${Math.max(pct, 0.5)}%` }}
|
||||
/>
|
||||
</div>
|
||||
<div className="flex items-center gap-3 text-xs text-[var(--color-text-muted)]">
|
||||
<span>{formatNumber(job.processed)}/{formatNumber(job.totalItems)} ({pct.toFixed(1)}%)</span>
|
||||
<span>{formatNumber(job.itemsFound)} found</span>
|
||||
{job.failed > 0 && <span className="text-[var(--color-error)]">{job.failed} failed</span>}
|
||||
{job.startedAt && <span className="ml-auto">{formatRelativeTime(job.startedAt)}</span>}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Start New Job */}
|
||||
<div className="bg-[var(--color-card)] border border-[var(--color-card-border)] rounded-xl p-4 mb-4">
|
||||
<h3 className="text-sm font-semibold text-[var(--foreground)] mb-3">Start New Job</h3>
|
||||
<div className="flex gap-2 flex-wrap items-end">
|
||||
<div>
|
||||
<label className="text-xs text-[var(--color-text-muted)] block mb-1">Type</label>
|
||||
<select
|
||||
value={newJobType}
|
||||
onChange={(e) => setNewJobType(e.target.value)}
|
||||
className="px-2 py-1.5 text-sm rounded-lg border border-[var(--color-input-border)] bg-[var(--color-input-bg)] text-[var(--foreground)]"
|
||||
>
|
||||
<option value="scraper">Scraper</option>
|
||||
<option value="google-enrichment">Google Places</option>
|
||||
<option value="freesearch-enrichment">FreeSearch</option>
|
||||
<option value="reverse-geocode-enrichment">Reverse Geocode</option>
|
||||
</select>
|
||||
</div>
|
||||
{newJobType === 'scraper' && (
|
||||
<div>
|
||||
<label className="text-xs text-[var(--color-text-muted)] block mb-1">Language</label>
|
||||
<select
|
||||
value={newJobLanguage}
|
||||
onChange={(e) => setNewJobLanguage(e.target.value)}
|
||||
className="px-2 py-1.5 text-sm rounded-lg border border-[var(--color-input-border)] bg-[var(--color-input-bg)] text-[var(--foreground)]"
|
||||
>
|
||||
<option value="english">English</option>
|
||||
<option value="generic">Generic</option>
|
||||
</select>
|
||||
</div>
|
||||
)}
|
||||
{newJobType !== 'scraper' && (
|
||||
<div>
|
||||
<label className="text-xs text-[var(--color-text-muted)] block mb-1">Country</label>
|
||||
<input
|
||||
type="text"
|
||||
placeholder="e.g. FR"
|
||||
value={newJobCountry}
|
||||
onChange={(e) => setNewJobCountry(e.target.value.toUpperCase())}
|
||||
className="w-16 px-2 py-1.5 text-sm rounded-lg border border-[var(--color-input-border)] bg-[var(--color-input-bg)] text-[var(--foreground)]"
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
<div>
|
||||
<label className="text-xs text-[var(--color-text-muted)] block mb-1">Limit</label>
|
||||
<input
|
||||
type="number"
|
||||
value={newJobLimit}
|
||||
onChange={(e) => setNewJobLimit(e.target.value)}
|
||||
className="w-20 px-2 py-1.5 text-sm rounded-lg border border-[var(--color-input-border)] bg-[var(--color-input-bg)] text-[var(--foreground)]"
|
||||
/>
|
||||
</div>
|
||||
{(newJobType === 'freesearch-enrichment' || newJobType === 'reverse-geocode-enrichment') && (
|
||||
<label className="flex items-center gap-1.5 text-sm text-[var(--foreground)] cursor-pointer py-1.5">
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={newJobContinuous}
|
||||
onChange={(e) => setNewJobContinuous(e.target.checked)}
|
||||
className="rounded"
|
||||
/>
|
||||
Continuous
|
||||
</label>
|
||||
)}
|
||||
{newJobType === 'freesearch-enrichment' && (
|
||||
<label className="flex items-center gap-1.5 text-sm text-[var(--foreground)] cursor-pointer py-1.5">
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={newJobReSearch}
|
||||
onChange={(e) => setNewJobReSearch(e.target.checked)}
|
||||
className="rounded"
|
||||
/>
|
||||
Re-search
|
||||
</label>
|
||||
)}
|
||||
<button
|
||||
onClick={startJob}
|
||||
disabled={startingJob}
|
||||
className="px-4 py-1.5 text-sm rounded-lg bg-[var(--color-primary)] text-white font-medium disabled:opacity-50 hover:opacity-90 transition-opacity"
|
||||
>
|
||||
{startingJob ? 'Starting...' : 'Start'}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Job History */}
|
||||
{completedJobs.length > 0 && (
|
||||
<div>
|
||||
<h3 className="text-sm font-semibold text-[var(--foreground)] mb-2">Job History</h3>
|
||||
<div className="space-y-2">
|
||||
{completedJobs.slice(0, 20).map((job) => (
|
||||
<div
|
||||
key={job.id}
|
||||
className={`bg-[var(--color-card)] border rounded-xl p-3 ${
|
||||
job.status === 'completed'
|
||||
? 'border-[var(--color-success)]/20'
|
||||
: 'border-[var(--color-error)]/20'
|
||||
}`}
|
||||
>
|
||||
<div className="flex items-center justify-between mb-1">
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="text-sm font-medium">
|
||||
{JOB_TYPE_LABELS[job.type] || job.type}
|
||||
{job.language ? `: ${job.language}` : ''}
|
||||
</span>
|
||||
<span className={`text-[10px] px-1.5 py-0.5 rounded-full text-white font-medium ${
|
||||
job.status === 'completed' ? 'bg-[var(--color-success)]' : 'bg-[var(--color-error)]'
|
||||
}`}>
|
||||
{job.status === 'completed' ? 'DONE' : 'FAILED'}
|
||||
</span>
|
||||
</div>
|
||||
<span className="text-xs text-[var(--color-text-muted)]">
|
||||
{job.completedAt ? formatRelativeTime(job.completedAt) : ''}
|
||||
</span>
|
||||
</div>
|
||||
<div className="flex items-center gap-3 text-xs text-[var(--color-text-muted)]">
|
||||
<span>{formatNumber(job.processed)} processed</span>
|
||||
<span>{formatNumber(job.itemsFound)} found</span>
|
||||
{job.failed > 0 && <span className="text-[var(--color-error)]">{job.failed} failed</span>}
|
||||
</div>
|
||||
{job.error && (
|
||||
<p className="text-xs text-[var(--color-error)] mt-1 truncate">{job.error}</p>
|
||||
)}
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
15
src/chromadb/client.ts
Normal file
15
src/chromadb/client.ts
Normal file
@@ -0,0 +1,15 @@
|
||||
import { ChromaClient } from 'chromadb';
|
||||
|
||||
// Lazily created client shared by every caller in this process.
let clientInstance: ChromaClient | null = null;

/**
 * Return the process-wide ChromaDB client, creating it on first use.
 *
 * The server address is read from the CHROMADB_URL environment variable and
 * falls back to the built-in default when that variable is unset or empty.
 *
 * @returns The shared ChromaClient instance.
 */
export function getChromaClient(): ChromaClient {
  if (clientInstance !== null) {
    return clientInstance;
  }

  const serverUrl = process.env.CHROMADB_URL || 'http://192.168.0.145:8000';
  clientInstance = new ChromaClient({ path: serverUrl });
  return clientInstance;
}
|
||||
52
src/chromadb/collections.ts
Normal file
52
src/chromadb/collections.ts
Normal file
@@ -0,0 +1,52 @@
|
||||
import { Collection } from 'chromadb';
|
||||
import { getChromaClient } from './client';
|
||||
|
||||
/** Names of the ChromaDB collections used by the pipeline. */
export const COLLECTION_NAMES = {
  /** Church identity text, used for deduplication. */
  CHURCH_IDENTITY: 'church_identity',
  /** Stored FreeSearch results. */
  SEARCH_RESULTS: 'search_results',
  /** Page text used for content classification. */
  PAGE_CLASSIFICATION: 'page_classification',
  /** Text blocks containing mass times. */
  SCHEDULE_SECTIONS: 'schedule_sections',
  /** Page text snapshots used for change detection. */
  PAGE_SNAPSHOTS: 'page_snapshots',
} as const;

/** Union of the collection name strings listed in COLLECTION_NAMES. */
export type CollectionName = (typeof COLLECTION_NAMES)[keyof typeof COLLECTION_NAMES];
|
||||
|
||||
/**
|
||||
* Collection metadata schemas:
|
||||
*
|
||||
* church_identity:
|
||||
* Documents: "{name} {address} {city} {country}"
|
||||
* Metadata: churchId, country, source, lat, lng
|
||||
* Use: Deduplication
|
||||
*
|
||||
* search_results:
|
||||
* Documents: "{title} {description} {url}"
|
||||
* Metadata: churchId, churchName, churchCity, churchCountry, searchQuery, resultUrl, resultTitle, score, verified
|
||||
* Use: Store all FreeSearch results for cross-church matching and re-analysis
|
||||
*
|
||||
* page_classification:
|
||||
* Documents: Page text (first ~2000 chars)
|
||||
* Metadata: url, isMassSchedulePage, language
|
||||
* Use: Content classification
|
||||
*
|
||||
* schedule_sections:
|
||||
* Documents: Text block with mass times
|
||||
* Metadata: language, daysCovered, sourceUrl
|
||||
* Use: Schedule detection
|
||||
*
|
||||
* page_snapshots:
|
||||
* Documents: Full page text
|
||||
* Metadata: churchId, url, scrapeDate, scheduleHash
|
||||
* Use: Change detection
|
||||
*/
|
||||
|
||||
/**
 * Fetch the named ChromaDB collection, creating it if it does not exist yet.
 *
 * The collection is requested with cosine distance as its HNSW space.
 *
 * @param name One of the COLLECTION_NAMES values.
 * @returns The matching collection handle.
 */
export async function getCollection(name: CollectionName): Promise<Collection> {
  const collectionMetadata = { 'hnsw:space': 'cosine' };
  return getChromaClient().getOrCreateCollection({
    name,
    metadata: collectionMetadata,
  });
}
|
||||
43
src/chromadb/embeddings.ts
Normal file
43
src/chromadb/embeddings.ts
Normal file
@@ -0,0 +1,43 @@
|
||||
import OpenAI from 'openai';
|
||||
|
||||
// OpenAI-compatible client for the embedding server; EMBEDDING_API_URL overrides the default endpoint.
const client = new OpenAI({
  apiKey: 'unused', // Ollama doesn't require an API key
  baseURL: process.env.EMBEDDING_API_URL || 'http://192.168.0.75:11434/v1',
});

// Embedding model name; EMBEDDING_MODEL overrides the default.
const model = process.env.EMBEDDING_MODEL || 'nomic-embed-text';
|
||||
|
||||
/**
 * Compute embeddings for a list of texts through the OpenAI-compatible
 * embeddings endpoint.
 *
 * Requests are sent sequentially in chunks of 32 texts so the embedding
 * server is not handed the whole list at once. Each response is ordered by
 * its `index` field before being appended to the result.
 *
 * @param texts Texts to embed; an empty list yields an empty result.
 * @returns One embedding vector per input text.
 */
export async function embed(texts: string[]): Promise<number[][]> {
  const BATCH_SIZE = 32;
  const vectors: number[][] = [];

  let offset = 0;
  while (offset < texts.length) {
    const chunk = texts.slice(offset, offset + BATCH_SIZE);
    const { data } = await client.embeddings.create({ model, input: chunk });

    data.sort((left, right) => left.index - right.index);
    for (const entry of data) {
      vectors.push(entry.embedding);
    }

    offset += BATCH_SIZE;
  }

  return vectors;
}
|
||||
|
||||
/**
 * Convenience wrapper around `embed` for a single text.
 *
 * @param text Text to embed.
 * @returns The first vector returned by `embed` for that text.
 */
export async function embedSingle(text: string): Promise<number[]> {
  const vectors = await embed([text]);
  return vectors[0];
}
|
||||
250
src/chromadb/queries.ts
Normal file
250
src/chromadb/queries.ts
Normal file
@@ -0,0 +1,250 @@
|
||||
import { getCollection, COLLECTION_NAMES } from './collections';
|
||||
import { embed, embedSingle } from './embeddings';
|
||||
import crypto from 'crypto';
|
||||
|
||||
/** A nearest-neighbour hit attributed to a church. */
interface SimilarChurchResult {
  /** Id of the matched record in the collection. */
  id: string;
  /** `churchId` taken from the hit's metadata; empty string when absent. */
  churchId: string;
  /** Distance reported by the query; 1 when the query returned none. */
  distance: number;
  /** Stored document text of the hit; empty string when absent. */
  document: string;
  /** Raw metadata of the hit; empty object when absent. */
  metadata: Record<string, unknown>;
}

/** Hit from the search-results collection; same shape as SimilarChurchResult. */
type SearchMatchResult = SimilarChurchResult;

/** Outcome of classifying a page against known mass schedule pages. */
interface PageClassificationResult {
  /** True when the nearest known page is within the distance threshold. */
  isMassSchedulePage: boolean;
  /** Computed as 1 minus the distance to the nearest known page. */
  confidence: number;
  /** Document text of the nearest known page; empty string when none. */
  nearestDocument: string;
  /** Metadata of the nearest known page; empty object when none. */
  nearestMetadata: Record<string, unknown>;
}

/** A stored schedule section similar to the queried text block. */
interface ScheduleSectionResult {
  id: string;
  distance: number;
  document: string;
  metadata: Record<string, unknown>;
}

/** Outcome of comparing a page against its stored snapshot. */
interface PageChangeResult {
  hasChanged: boolean;
  /** 0 = identical, 1 = completely different. */
  changeScore: number;
  /** Hash stored with the previous snapshot; null when no snapshot exists. */
  previousHash: string | null;
  /** SHA-256 hex digest of the current page text. */
  currentHash: string;
}
|
||||
|
||||
/**
 * Look up stored church identities that are semantically close to `text`.
 * Intended for deduplication.
 *
 * @param text Identity text to compare (name, address, city, country).
 * @param options.country When set, only records whose `country` metadata equals it are searched.
 * @param options.nResults Maximum number of hits to return (default 10).
 * @returns Hits in the order returned by the query; empty when the query yields no id list.
 */
export async function findSimilarChurches(
  text: string,
  options: { country?: string; nResults?: number } = {}
): Promise<SimilarChurchResult[]> {
  const { country, nResults: limit = 10 } = options;
  const collection = await getCollection(COLLECTION_NAMES.CHURCH_IDENTITY);
  const queryVector = await embedSingle(text);

  const hits = await collection.query({
    queryEmbeddings: [queryVector],
    nResults: limit,
    where: country ? { country } : undefined,
  });

  const ids = hits.ids[0];
  if (!ids) {
    return [];
  }

  const matches: SimilarChurchResult[] = [];
  for (let i = 0; i < ids.length; i++) {
    const meta = hits.metadatas[0][i];
    matches.push({
      id: ids[i],
      churchId: (meta?.churchId as string) || '',
      distance: hits.distances?.[0]?.[i] ?? 1,
      document: hits.documents[0][i] || '',
      metadata: (meta as Record<string, unknown>) || {},
    });
  }
  return matches;
}
|
||||
|
||||
/**
 * Match a FreeSearch result against stored search results using semantic
 * similarity. Supplements domain-matching logic.
 *
 * @param text Search-result text to compare.
 * @param options.country When set, restricts the search to records for that
 *   country. The search_results collection stores the country under the
 *   `churchCountry` metadata key (not `country`), so the filter targets that key.
 * @param options.nResults Maximum number of hits to return (default 5).
 * @returns Hits in the order returned by the query; empty when the query yields no id list.
 */
export async function matchSearchResultToChurch(
  text: string,
  options: { country?: string; nResults?: number } = {}
): Promise<SearchMatchResult[]> {
  const { country, nResults = 5 } = options;
  const collection = await getCollection(COLLECTION_NAMES.SEARCH_RESULTS);
  const queryEmbedding = await embedSingle(text);

  // Filtering on `country` would never match: this collection's metadata
  // uses `churchCountry` for the country code.
  const whereFilter = country ? { churchCountry: country } : undefined;

  const results = await collection.query({
    queryEmbeddings: [queryEmbedding],
    nResults,
    where: whereFilter,
  });

  const ids = results.ids[0];
  if (!ids) return [];

  return ids.map((id, i) => {
    const metadata = results.metadatas[0][i];
    return {
      id,
      churchId: (metadata?.churchId as string) || '',
      distance: results.distances?.[0]?.[i] ?? 1,
      document: results.documents[0][i] || '',
      metadata: (metadata as Record<string, unknown>) || {},
    };
  });
}
|
||||
|
||||
/**
 * Decide whether a page looks like a mass schedule page by comparing it with
 * the nearest stored page that is flagged as one.
 *
 * Only the first 2000 characters of the page are embedded.
 *
 * @param pageText Text content of the page.
 * @param threshold Maximum distance at which the page still counts as a schedule page (default 0.3).
 * @returns The verdict, `1 - distance` as confidence, and the nearest stored page
 *   (distance defaults to 1 when the query returns none).
 */
export async function classifyPage(
  pageText: string,
  threshold: number = 0.3
): Promise<PageClassificationResult> {
  const collection = await getCollection(COLLECTION_NAMES.PAGE_CLASSIFICATION);
  const sample = pageText.slice(0, 2000);
  const queryVector = await embedSingle(sample);

  const nearest = await collection.query({
    queryEmbeddings: [queryVector],
    nResults: 1,
    where: { isMassSchedulePage: true },
  });

  const distance = nearest.distances?.[0]?.[0] ?? 1;
  const nearestDocument = nearest.documents[0]?.[0] || '';
  const nearestMetadata = (nearest.metadatas[0]?.[0] as Record<string, unknown>) || {};

  return {
    isMassSchedulePage: distance <= threshold,
    confidence: 1 - distance,
    nearestDocument,
    nearestMetadata,
  };
}
|
||||
|
||||
/**
 * Retrieve stored schedule sections that resemble the given text block.
 * Helps locate schedule blocks on complex pages.
 *
 * @param textBlock Candidate text to compare.
 * @param options.language When set, only sections whose `language` metadata equals it are searched.
 * @param options.nResults Maximum number of hits to return (default 5).
 * @returns Hits in the order returned by the query; empty when the query yields no id list.
 */
export async function findScheduleSections(
  textBlock: string,
  options: { language?: string; nResults?: number } = {}
): Promise<ScheduleSectionResult[]> {
  const { language, nResults: limit = 5 } = options;
  const collection = await getCollection(COLLECTION_NAMES.SCHEDULE_SECTIONS);
  const queryVector = await embedSingle(textBlock);

  const hits = await collection.query({
    queryEmbeddings: [queryVector],
    nResults: limit,
    where: language ? { language } : undefined,
  });

  const ids = hits.ids[0];
  if (!ids) {
    return [];
  }

  const sections: ScheduleSectionResult[] = [];
  for (let i = 0; i < ids.length; i++) {
    sections.push({
      id: ids[i],
      distance: hits.distances?.[0]?.[i] ?? 1,
      document: hits.documents[0][i] || '',
      metadata: (hits.metadatas[0][i] as Record<string, unknown>) || {},
    });
  }
  return sections;
}
|
||||
|
||||
/**
 * Cosine distance between two embedding vectors, clamped to [0, 1].
 *
 * Returns 1 (treated as "completely different") whenever no meaningful
 * comparison is possible: a vector is missing or empty, the lengths differ,
 * or a vector has zero magnitude (which would otherwise produce NaN).
 */
function embeddingChangeScore(
  current: number[] | undefined,
  previous: number[] | undefined
): number {
  if (!current || !previous || current.length === 0 || current.length !== previous.length) {
    return 1;
  }

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < current.length; i++) {
    dotProduct += current[i] * previous[i];
    normA += current[i] ** 2;
    normB += previous[i] ** 2;
  }

  const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
  if (!Number.isFinite(magnitude) || magnitude === 0) {
    return 1;
  }

  const cosineSimilarity = dotProduct / magnitude;
  if (!Number.isFinite(cosineSimilarity)) {
    return 1;
  }

  // Cosine distance is 0 for identical direction and up to 2 for opposite; cap at 1.
  return Math.max(0, Math.min(1, 1 - cosineSimilarity));
}

/**
 * Detect whether a page has changed since the last stored snapshot.
 *
 * The SHA-256 of the current text is compared with the `scheduleHash` stored
 * in the snapshot metadata. When the hashes differ, the first 4000 characters
 * of the current and stored text are embedded and their cosine distance is
 * reported as `changeScore`.
 *
 * @param churchId Church whose snapshot should be compared.
 * @param pageText Current page text.
 * @returns `hasChanged: true, changeScore: 1, previousHash: null` when no snapshot exists;
 *   `hasChanged: false, changeScore: 0` when the hashes match; otherwise
 *   `hasChanged: true` with a change score in [0, 1].
 */
export async function detectPageChanges(
  churchId: string,
  pageText: string
): Promise<PageChangeResult> {
  const collection = await getCollection(COLLECTION_NAMES.PAGE_SNAPSHOTS);
  const currentHash = crypto.createHash('sha256').update(pageText).digest('hex');

  // Look up existing snapshot for this church
  const existing = await collection.get({
    where: { churchId },
    limit: 1,
  });

  if (!existing.ids.length) {
    return {
      hasChanged: true,
      changeScore: 1,
      previousHash: null,
      currentHash,
    };
  }

  const previousHash = (existing.metadatas[0]?.scheduleHash as string) || '';

  if (previousHash === currentHash) {
    return {
      hasChanged: false,
      changeScore: 0,
      previousHash,
      currentHash,
    };
  }

  // Embed both texts in a single request and measure how far apart they are.
  const previousDoc = existing.documents[0] || '';
  const [currentEmbedding, previousEmbedding] = await embed([
    pageText.slice(0, 4000),
    previousDoc.slice(0, 4000),
  ]);

  return {
    hasChanged: true,
    changeScore: embeddingChangeScore(currentEmbedding, previousEmbedding),
    previousHash,
    currentHash,
  };
}
|
||||
|
||||
/**
 * Store (or replace) the snapshot of a church's page in the page_snapshots
 * collection. Call after a successful scrape.
 *
 * The record id is `snapshot-<churchId>`. The hash is computed over the full
 * page text, while the stored document and its embedding use only the first
 * 8000 characters.
 *
 * @param churchId Church the page belongs to.
 * @param url URL the page was scraped from.
 * @param pageText Text content of the page.
 */
export async function upsertPageSnapshot(
  churchId: string,
  url: string,
  pageText: string
): Promise<void> {
  const snapshots = await getCollection(COLLECTION_NAMES.PAGE_SNAPSHOTS);
  const scheduleHash = crypto.createHash('sha256').update(pageText).digest('hex');
  const storedText = pageText.slice(0, 8000); // Limit stored text size
  const vectors = await embed([storedText]);

  const snapshotMetadata = {
    churchId,
    url,
    scrapeDate: new Date().toISOString(),
    scheduleHash,
  };

  await snapshots.upsert({
    ids: [`snapshot-${churchId}`],
    embeddings: [vectors[0]],
    documents: [storedText],
    metadatas: [snapshotMetadata],
  });
}
|
||||
22
src/lib/admin-auth.ts
Normal file
22
src/lib/admin-auth.ts
Normal file
@@ -0,0 +1,22 @@
|
||||
import { NextRequest, NextResponse } from 'next/server';
|
||||
import { timingSafeEqual } from 'crypto';
|
||||
|
||||
/**
 * Check the `x-api-key` request header against the ADMIN_API_KEY environment
 * variable using a constant-time comparison.
 *
 * @param request Incoming request whose headers are inspected.
 * @returns true only when both keys are present and byte-for-byte equal.
 *   Returns false (and logs a warning) when ADMIN_API_KEY is not configured.
 */
export function validateAdminApiKey(request: NextRequest): boolean {
  const apiKey = request.headers.get('x-api-key');
  const expectedKey = process.env.ADMIN_API_KEY;

  if (!expectedKey || !apiKey) {
    if (!expectedKey) console.warn('ADMIN_API_KEY not configured');
    return false;
  }

  const providedBytes = Buffer.from(apiKey);
  const expectedBytes = Buffer.from(expectedKey);

  // timingSafeEqual throws when the byte lengths differ, so compare the
  // encoded lengths: strings with the same character count can still encode
  // to a different number of bytes when they contain non-ASCII characters.
  if (providedBytes.length !== expectedBytes.length) {
    return false;
  }

  return timingSafeEqual(providedBytes, expectedBytes);
}
|
||||
|
||||
/**
 * Build the JSON response returned for rejected requests:
 * HTTP 401 with the body `{ error: 'Unauthorized' }`.
 */
export function unauthorizedResponse() {
  const body = { error: 'Unauthorized' };
  const init = { status: 401 };
  return NextResponse.json(body, init);
}
|
||||
323
src/lib/baidu-client.ts
Normal file
323
src/lib/baidu-client.ts
Normal file
@@ -0,0 +1,323 @@
|
||||
/**
|
||||
* Baidu Maps API Client for importing Catholic churches in China
|
||||
* Uses Place Search API v2 with grid-based search strategy
|
||||
*
|
||||
* Coordinate system: Baidu returns BD-09 → convert to GCJ-02 → convert to WGS-84
|
||||
*/
|
||||
|
||||
// @ts-expect-error coordtransform has no type declarations
|
||||
import coordtransform from 'coordtransform';
|
||||
|
||||
/** Endpoint of the Baidu Place Search API v2. */
const BAIDU_API_URL = 'https://api.map.baidu.com/place/v2/search';

/** Chinese search terms for Catholic churches; every cell is searched once per term. */
const SEARCH_QUERIES = ['天主教堂', '天主堂'];

/** Bounding box, in degrees, that the search grid covers. */
const CHINA_BOUNDS = {
  latMin: 18.0,
  latMax: 54.0,
  lngMin: 73.0,
  lngMax: 135.0,
};

/** Edge length of a grid cell, in degrees. */
const CELL_SIZE = 2.0;
/** Edge length of the sub-cells a dense cell is split into, in degrees. */
const SUB_CELL_SIZE = 1.0;

/** Results requested per page. */
const PAGE_SIZE = 20;
/** Pages fetched per query per cell: 20 pages × 20 results = 400 max. */
const MAX_PAGES = 20;
/** Minimum spacing between requests in milliseconds (~5 QPS). */
const RATE_LIMIT_MS = 200;
|
||||
|
||||
/** A church returned by the Baidu search, with coordinates converted to WGS-84. */
export interface BaiduChurch {
  /** Baidu's unique id (`uid`) of the place. */
  baiduId: string;
  name: string;
  /** Latitude in WGS-84. */
  lat: number;
  /** Longitude in WGS-84. */
  lng: number;
  address?: string;
  city?: string;
  province?: string;
  phone?: string;
  website?: string;
}

/** One rectangular search area of the grid, in degrees. */
export interface GridCell {
  /** Position of the cell in the generated list. */
  index: number;
  south: number;
  west: number;
  north: number;
  east: number;
}

/** A single place entry as it appears in a Baidu API response. */
interface BaiduApiResult {
  uid: string;
  name: string;
  /** Coordinates as returned by Baidu (BD-09). */
  location: { lat: number; lng: number };
  address?: string;
  province?: string;
  city?: string;
  telephone?: string;
  detail_info?: {
    tag?: string;
    overall_rating?: string;
    detail_url?: string;
  };
}

/** Envelope of a Baidu Place Search response. */
interface BaiduApiResponse {
  /** 0 on success; any other value is treated as an error. */
  status: number;
  message: string;
  /** Total number of matches reported for the query. */
  total: number;
  results: BaiduApiResult[];
}
|
||||
|
||||
// Timestamp (ms since epoch) of the most recent rate-limited request; 0 before the first one.
let lastRequestTime = 0;

/**
 * Resolve after the given number of milliseconds.
 *
 * @param ms Time to wait, in milliseconds.
 */
async function delay(ms: number): Promise<void> {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Wait until at least RATE_LIMIT_MS milliseconds have passed since the
 * previous call, then record the current time as the new reference point.
 */
async function enforceRateLimit(): Promise<void> {
  const elapsed = Date.now() - lastRequestTime;
  const remaining = RATE_LIMIT_MS - elapsed;

  if (remaining > 0) {
    await delay(remaining);
  }

  lastRequestTime = Date.now();
}
|
||||
|
||||
/**
 * Convert a BD-09 coordinate pair to WGS-84 in two steps:
 * BD-09 → GCJ-02, then GCJ-02 → WGS-84.
 *
 * @param bdLng Longitude in BD-09.
 * @param bdLat Latitude in BD-09.
 * @returns The converted longitude and latitude.
 */
function bd09ToWgs84(bdLng: number, bdLat: number): { lat: number; lng: number } {
  const [gcjLng, gcjLat] = coordtransform.bd09togcj02(bdLng, bdLat) as [number, number];
  const [lng, lat] = coordtransform.gcj02towgs84(gcjLng, gcjLat) as [number, number];
  return { lng, lat };
}
|
||||
|
||||
/**
 * Build the list of grid cells that tile CHINA_BOUNDS.
 *
 * Cells are CELL_SIZE degrees on each side, generated row by row from south
 * to north and west to east, and clipped at the northern and eastern edges
 * of the bounding box. Indices are assigned in generation order from 0.
 *
 * @returns All cells covering the bounding box.
 */
export function generateGridCells(): GridCell[] {
  const { latMin, latMax, lngMin, lngMax } = CHINA_BOUNDS;
  const cells: GridCell[] = [];

  for (let south = latMin; south < latMax; south += CELL_SIZE) {
    const north = Math.min(south + CELL_SIZE, latMax);

    for (let west = lngMin; west < lngMax; west += CELL_SIZE) {
      cells.push({
        index: cells.length,
        south,
        west,
        north,
        east: Math.min(west + CELL_SIZE, lngMax),
      });
    }
  }

  return cells;
}
|
||||
|
||||
/**
 * Split a cell into sub-cells of SUB_CELL_SIZE degrees, clipped to the
 * parent cell's northern and eastern edges.
 *
 * Sub-cell indices start again at 0 and are local to the parent cell.
 *
 * @param cell Cell to split.
 * @returns The sub-cells covering `cell`.
 */
function subdivideCell(cell: GridCell): GridCell[] {
  const parts: GridCell[] = [];

  for (let south = cell.south; south < cell.north; south += SUB_CELL_SIZE) {
    const north = Math.min(south + SUB_CELL_SIZE, cell.north);

    for (let west = cell.west; west < cell.east; west += SUB_CELL_SIZE) {
      parts.push({
        index: parts.length,
        south,
        west,
        north,
        east: Math.min(west + SUB_CELL_SIZE, cell.east),
      });
    }
  }

  return parts;
}
|
||||
|
||||
/**
 * Fetch a single result page, retrying failed attempts.
 *
 * Up to three attempts are made. HTTP failures and API errors are retried
 * after 2s; status 302 (handled as "quota exceeded") is retried after 60s.
 * When the last attempt fails the error is thrown, so a page is never
 * skipped silently.
 *
 * @param url Fully built request URL.
 * @returns The parsed response of the first successful attempt (status 0).
 * @throws The error of the final failed attempt.
 */
async function fetchBaiduPage(url: string): Promise<BaiduApiResponse> {
  const MAX_ATTEMPTS = 3;

  for (let attempt = 1; ; attempt++) {
    let quotaExceeded = false;

    try {
      const response = await fetch(url);
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      const data = (await response.json()) as BaiduApiResponse;
      if (data.status === 0) {
        return data;
      }

      quotaExceeded = data.status === 302;
      throw new Error(`Baidu API error ${data.status}: ${data.message}`);
    } catch (error) {
      const retriesLeft = MAX_ATTEMPTS - attempt;
      if (retriesLeft === 0) throw error;

      if (quotaExceeded) {
        console.warn('Baidu API quota exceeded, waiting 60s...');
        await delay(60000);
      } else {
        console.warn(`Retrying Baidu API request (${retriesLeft} retries left):`, (error as Error).message);
        await delay(2000);
      }
    }
  }
}

/**
 * Search Baidu Maps API for a single query in a bounding box.
 *
 * Pages are requested one after another (rate limited) until a page comes
 * back empty, all reported results have been collected, or MAX_PAGES pages
 * have been fetched.
 *
 * @param apiKey Baidu Maps API key (AK).
 * @param query Search term.
 * @param bounds Bounding box to search, in degrees.
 * @returns The collected results and the total reported by the last response.
 * @throws When a page cannot be fetched after all retries.
 */
async function searchBaiduCell(
  apiKey: string,
  query: string,
  bounds: { south: number; west: number; north: number; east: number }
): Promise<{ results: BaiduApiResult[]; total: number }> {
  const allResults: BaiduApiResult[] = [];
  let total = 0;

  for (let pageNum = 0; pageNum < MAX_PAGES; pageNum++) {
    await enforceRateLimit();

    const params = new URLSearchParams({
      query,
      bounds: `${bounds.south},${bounds.west},${bounds.north},${bounds.east}`,
      output: 'json',
      ak: apiKey,
      scope: '2',
      page_size: String(PAGE_SIZE),
      page_num: String(pageNum),
    });

    const data = await fetchBaiduPage(`${BAIDU_API_URL}?${params}`);

    total = data.total;
    if (!data.results || data.results.length === 0) {
      break;
    }

    allResults.push(...data.results);

    // If we've collected all results, stop paginating
    if (allResults.length >= total) {
      break;
    }
  }

  return { results: allResults, total };
}
|
||||
|
||||
/**
 * Convert a raw Baidu API result into a BaiduChurch.
 *
 * Coordinates are converted from BD-09 to WGS-84. Empty optional fields are
 * normalised to `undefined`; `website` is not populated here.
 *
 * @param result Raw place entry from the API.
 * @returns The parsed church record.
 */
function parseBaiduResult(result: BaiduApiResult): BaiduChurch {
  const { uid, name, location, address, city, province, telephone } = result;
  const { lat, lng } = bd09ToWgs84(location.lng, location.lat);

  return {
    baiduId: uid,
    name,
    lat,
    lng,
    address: address || undefined,
    city: city || undefined,
    province: province || undefined,
    phone: telephone || undefined,
  };
}
|
||||
|
||||
/** Progress snapshot passed to the grid search's progress callback after each cell. */
export interface BaiduSearchProgress {
  /** Index of the cell that was just processed. */
  cellIndex: number;
  /** Number of cells in the whole grid. */
  totalCells: number;
  /** Unique churches collected so far. */
  churchesFound: number;
}
|
||||
|
||||
/**
 * Query Baidu Maps for Catholic churches across all of China using grid search.
 *
 * Every grid cell is searched once per search term. A cell whose reported
 * total exceeds what pagination can return (PAGE_SIZE × MAX_PAGES) is also
 * searched sub-cell by sub-cell. Results already obtained for the whole cell
 * are kept, so a failing sub-cell query cannot lose them; duplicates are
 * removed by Baidu `uid`.
 *
 * Search errors are logged and the affected query is skipped (best effort).
 *
 * @param apiKey Baidu Maps API key (AK)
 * @param onProgress Optional callback invoked after each processed cell
 * @param resumeFromCell Optional cell index to resume from (for crash recovery);
 *   cells with a lower index are skipped
 * @returns Deduplicated array of BaiduChurch objects
 */
export async function queryBaiduByGrid(
  apiKey: string,
  onProgress?: (progress: BaiduSearchProgress) => void,
  resumeFromCell: number = 0
): Promise<BaiduChurch[]> {
  const cells = generateGridCells();
  const seenIds = new Set<string>();
  const allChurches: BaiduChurch[] = [];

  console.log(`Searching ${cells.length} grid cells (${CELL_SIZE}° × ${CELL_SIZE}°), starting from cell ${resumeFromCell}`);

  for (const cell of cells) {
    if (cell.index < resumeFromCell) continue;

    const cellChurches: BaiduApiResult[] = [];
    let needsSubdivision = false;

    for (const query of SEARCH_QUERIES) {
      try {
        const { results, total } = await searchBaiduCell(apiKey, query, cell);
        cellChurches.push(...results);

        // If total > 400 (API limit), we need to subdivide
        if (total > PAGE_SIZE * MAX_PAGES) {
          needsSubdivision = true;
        }
      } catch (error) {
        console.error(`Error searching cell ${cell.index} for "${query}":`, (error as Error).message);
      }
    }

    // If a cell is too dense, search its sub-cells as well. The results
    // collected above are kept: the uid check below removes duplicates, and
    // they remain available if a sub-cell query fails.
    if (needsSubdivision) {
      console.log(`Cell ${cell.index} has too many results, subdividing into ${SUB_CELL_SIZE}° × ${SUB_CELL_SIZE}° sub-cells...`);

      for (const subCell of subdivideCell(cell)) {
        for (const query of SEARCH_QUERIES) {
          try {
            const { results } = await searchBaiduCell(apiKey, query, subCell);
            cellChurches.push(...results);
          } catch (error) {
            console.error(`Error searching sub-cell for "${query}":`, (error as Error).message);
          }
        }
      }
    }

    // Deduplicate by uid across queries and overlapping cells
    let newCount = 0;
    for (const result of cellChurches) {
      if (!seenIds.has(result.uid)) {
        seenIds.add(result.uid);
        allChurches.push(parseBaiduResult(result));
        newCount++;
      }
    }

    if (newCount > 0) {
      console.log(`Cell ${cell.index}/${cells.length}: +${newCount} churches (total: ${allChurches.length})`);
    }

    onProgress?.({
      cellIndex: cell.index,
      totalCells: cells.length,
      churchesFound: allChurches.length,
    });
  }

  console.log(`\nBaidu search complete: ${allChurches.length} unique churches found`);
  return allChurches;
}
|
||||
197
src/lib/country-normalize.ts
Normal file
197
src/lib/country-normalize.ts
Normal file
@@ -0,0 +1,197 @@
|
||||
/**
|
||||
* Normalize country names/codes to ISO 3166-1 alpha-2.
|
||||
* Used by import scripts and enrichment to ensure consistent country codes.
|
||||
*/
|
||||
|
||||
/**
 * Known spellings of each country, grouped by ISO 3166-1 alpha-2 code.
 *
 * Every spelling is lowercase. Each group lists the English names first,
 * followed by native-language names and, where present, their accent-free
 * variants.
 */
const COUNTRY_ALIASES_BY_ISO: ReadonlyArray<readonly [string, readonly string[]]> = [
  ['US', ['united states', 'united states of america', 'usa']],
  ['CA', ['canada']],
  ['GB', ['united kingdom', 'great britain', 'england', 'scotland', 'wales']],
  ['IE', ['ireland', 'éire', 'eire']],
  ['AU', ['australia']],
  ['NZ', ['new zealand']],
  ['PH', ['philippines']],
  ['FR', ['france']],
  ['DE', ['germany', 'deutschland']],
  ['AT', ['austria', 'österreich', 'osterreich']],
  ['CH', ['switzerland', 'schweiz', 'suisse', 'svizzera']],
  ['LI', ['liechtenstein']],
  ['IT', ['italy', 'italia']],
  ['SM', ['san marino']],
  ['VA', ['vatican city', 'holy see']],
  ['ES', ['spain', 'españa', 'espana']],
  ['PT', ['portugal']],
  ['BR', ['brazil', 'brasil']],
  ['MX', ['mexico', 'méxico']],
  ['AR', ['argentina']],
  ['CO', ['colombia']],
  ['CL', ['chile']],
  ['PE', ['peru']],
  ['EC', ['ecuador']],
  ['VE', ['venezuela']],
  ['CR', ['costa rica']],
  ['PA', ['panama']],
  ['GT', ['guatemala']],
  ['CU', ['cuba']],
  ['HN', ['honduras']],
  ['SV', ['el salvador']],
  ['NI', ['nicaragua']],
  ['BO', ['bolivia']],
  ['PY', ['paraguay']],
  ['UY', ['uruguay']],
  ['DO', ['dominican republic']],
  ['PR', ['puerto rico']],
  ['PL', ['poland', 'polska']],
  ['CZ', ['czech republic', 'czechia', 'česká republika', 'ceska republika', 'česko', 'cesko']],
  ['SK', ['slovakia', 'slovensko']],
  ['HU', ['hungary', 'magyarország', 'magyarorszag']],
  ['HR', ['croatia', 'hrvatska']],
  ['SI', ['slovenia', 'slovenija']],
  ['RO', ['romania', 'românia']],
  ['BG', ['bulgaria']],
  ['RS', ['serbia', 'србија']],
  ['LT', ['lithuania', 'lietuva']],
  ['LV', ['latvia', 'latvija']],
  ['EE', ['estonia', 'eesti']],
  ['NL', ['netherlands', 'the netherlands', 'holland', 'nederland']],
  ['BE', ['belgium', 'belgique', 'belgië', 'belgie']],
  ['LU', ['luxembourg']],
  ['MT', ['malta']],
  ['CY', ['cyprus']],
  ['GR', ['greece', 'ελλάδα']],
  ['AL', ['albania', 'shqipëri', 'shqiperi']],
  ['MK', ['north macedonia', 'macedonia', 'северна македонија']],
  ['BA', ['bosnia and herzegovina', 'bosnia', 'bosna i hercegovina']],
  ['ME', ['montenegro', 'crna gora']],
  ['XK', ['kosovo']],
  ['UA', ['ukraine', 'україна']],
  ['BY', ['belarus', 'беларусь']],
  ['MD', ['moldova']],
  ['DK', ['denmark', 'danmark']],
  ['NO', ['norway', 'norge']],
  ['SE', ['sweden', 'sverige']],
  ['FI', ['finland', 'suomi']],
  ['IS', ['iceland', 'ísland', 'island']],
  ['IN', ['india']],
  ['KR', ['south korea', 'korea']],
  ['JP', ['japan']],
  ['CN', ['china']],
  ['TW', ['taiwan']],
  ['VN', ['vietnam']],
  ['ID', ['indonesia']],
  ['MY', ['malaysia']],
  ['SG', ['singapore']],
  ['TH', ['thailand']],
  ['NG', ['nigeria']],
  ['KE', ['kenya']],
  ['ZA', ['south africa']],
  ['CD', ['democratic republic of the congo', 'congo']],
  ['TZ', ['tanzania']],
  ['UG', ['uganda']],
  ['GH', ['ghana']],
  ['CM', ['cameroon']],
  ['ET', ['ethiopia']],
  ['MG', ['madagascar']],
  ['MZ', ['mozambique']],
  ['AO', ['angola']],
  ['RW', ['rwanda']],
  ['HT', ['haiti']],
  ['JM', ['jamaica']],
  ['TT', ['trinidad and tobago']],
];

/**
 * Lowercase country name → ISO 3166-1 alpha-2 code.
 *
 * The table has a null prototype, so indexing it with arbitrary input
 * (for example 'constructor' or '__proto__') yields `undefined` instead of
 * an inherited `Object.prototype` member.
 */
const COUNTRY_NAME_TO_ISO: Record<string, string> = Object.create(null);
for (const [isoCode, aliases] of COUNTRY_ALIASES_BY_ISO) {
  for (const alias of aliases) {
    COUNTRY_NAME_TO_ISO[alias] = isoCode;
  }
}
|
||||
|
||||
// ISO 3166-1 alpha-2 codes recognised as already normalised. This is a
// project-relevant subset, not the full ISO list. Each string below is one
// space-separated group of codes.
const VALID_ISO_CODES = new Set<string>(
  [
    'US CA GB IE AU NZ PH',
    'FR DE AT CH LI',
    'IT SM VA',
    'ES PT BR MX AR CO CL PE EC VE',
    'CR PA GT CU HN SV NI BO PY UY DO PR',
    'PL CZ SK HU HR SI RO BG RS',
    'LT LV EE',
    'NL BE LU MT CY',
    'GR AL MK BA ME XK',
    'UA BY MD',
    'DK NO SE FI IS',
    'IN KR JP CN TW VN ID MY SG TH',
    'NG KE ZA CD TZ UG GH CM ET MG MZ AO RW',
    'HT JM TT',
  ]
    .join(' ')
    .split(' '),
);
|
||||
|
||||
/**
 * Common ISO 3166-1 alpha-3 codes mapped to their alpha-2 equivalents.
 * Built once at module load rather than on every call.
 */
const ISO3_TO_ISO2: ReadonlyMap<string, string> = new Map([
  ['USA', 'US'], ['CAN', 'CA'], ['GBR', 'GB'], ['IRL', 'IE'], ['AUS', 'AU'],
  ['NZL', 'NZ'], ['PHL', 'PH'], ['FRA', 'FR'], ['DEU', 'DE'], ['AUT', 'AT'],
  ['CHE', 'CH'], ['ITA', 'IT'], ['ESP', 'ES'], ['PRT', 'PT'], ['BRA', 'BR'],
  ['MEX', 'MX'], ['ARG', 'AR'], ['COL', 'CO'], ['POL', 'PL'], ['CZE', 'CZ'],
  ['SVK', 'SK'], ['HUN', 'HU'], ['HRV', 'HR'], ['SVN', 'SI'], ['ROU', 'RO'],
  ['NLD', 'NL'], ['BEL', 'BE'], ['LUX', 'LU'],
]);

/**
 * Look up a lowercase country name in COUNTRY_NAME_TO_ISO.
 *
 * Only own properties count, so names that collide with inherited object
 * members (such as 'constructor') are reported as unknown.
 */
function lookupCountryName(name: string): string | undefined {
  return Object.prototype.hasOwnProperty.call(COUNTRY_NAME_TO_ISO, name)
    ? COUNTRY_NAME_TO_ISO[name]
    : undefined;
}

/**
 * Normalize a country string to an ISO 3166-1 alpha-2 code.
 *
 * Resolution order:
 *   1. a 2-letter code present in VALID_ISO_CODES (any letter case),
 *   2. a known 3-letter ISO code (any letter case),
 *   3. a known country name, compared case-insensitively,
 *   4. the same name with combining accents stripped.
 *
 * @param country Country name or code; surrounding whitespace is ignored.
 * @returns The uppercase alpha-2 code, or the trimmed input when the value
 *          is not recognised. Falsy input is returned unchanged.
 */
export function normalizeCountryCode(country: string): string {
  if (!country) return country;

  const trimmed = country.trim();
  const upper = trimmed.toUpperCase();

  // Already a recognised 2-letter code?
  if (upper.length === 2 && VALID_ISO_CODES.has(upper)) {
    return upper;
  }

  // Common 3-letter ISO codes.
  if (upper.length === 3) {
    const alpha2 = ISO3_TO_ISO2.get(upper);
    if (alpha2) return alpha2;
  }

  // Compose to NFC first so decomposed input (base letter + combining mark)
  // can match the composed spellings used as table keys.
  const lower = trimmed.toLowerCase().normalize('NFC');
  const byName = lookupCountryName(lower);
  if (byName) return byName;

  // Retry with combining diacritics removed (basic accent folding).
  const withoutAccents = lower.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
  const byFoldedName = lookupCountryName(withoutAccents);
  if (byFoldedName) return byFoldedName;

  // Unknown — hand back the trimmed original.
  return trimmed;
}
|
||||
25
src/lib/db.ts
Normal file
25
src/lib/db.ts
Normal file
@@ -0,0 +1,25 @@
|
||||
import { Pool } from 'pg';
|
||||
import { PrismaPg } from '@prisma/adapter-pg';
|
||||
import { PrismaClient } from '@prisma/client';
|
||||
|
||||
/** Slots on `globalThis` used to keep one client and one pool per process. */
type PrismaGlobalCache = {
  prisma: PrismaClient | undefined;
  pool: InstanceType<typeof Pool> | undefined;
};

const globalCache = globalThis as unknown as PrismaGlobalCache;

const nodeEnv = process.env.NODE_ENV;

// Reuse a previously cached pool when present; otherwise open one against
// DATABASE_URL (an empty string when the variable is unset).
const pgPool =
  globalCache.pool ?? new Pool({ connectionString: process.env.DATABASE_URL || '' });

const pgAdapter = new PrismaPg(pgPool);

/** Build a Prisma client on the shared pg adapter; query logging is enabled only in development. */
function createPrismaClient(): PrismaClient {
  return new PrismaClient({
    adapter: pgAdapter,
    log: nodeEnv === 'development' ? ['query', 'error', 'warn'] : ['error'],
  });
}

/** Shared Prisma client for the application. */
export const prisma = globalCache.prisma ?? createPrismaClient();

// Outside production, stash both handles on globalThis so a later evaluation
// of this module picks them up instead of opening new connections
// (presumably to survive dev hot-reloads — confirm).
if (nodeEnv !== 'production') {
  globalCache.prisma = prisma;
  globalCache.pool = pgPool;
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user