Compare commits

...

10 Commits

Author SHA1 Message Date
albertfj114
3bd4d2e2f9 fix: write heartbeat file at startup to avoid cold-start unhealthy window 2026-03-28 10:05:21 -04:00
albertfj114
73d8e8990c fix: include freesearch-enrichment in deploy build step 2026-03-28 10:03:11 -04:00
albertfj114
3cb780a692 fix: replace pgrep healthcheck with heartbeat file check 2026-03-28 08:51:58 -04:00
albertfj114
8f7c4d1698 fix: write heartbeat file for Docker healthcheck 2026-03-28 08:50:19 -04:00
albertfj114
857eaedbcf fix: wait for FreeSearch on startup instead of exiting; clean stale jobs
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-28 08:46:03 -04:00
albertfj114
93d8a9080a docs: add freesearch stability implementation plan
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-28 08:40:40 -04:00
albertfj114
da4aa61860 docs: add freesearch stability & scheduler healthcheck design spec
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-28 08:38:17 -04:00
albertfj114
9593e08983 feat: add buscarmisas-network to package.json and scheduler pipeline
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-19 23:49:39 -04:00
albertfj114
2b37c2d5f2 feat: add buscarmisas-network importer — CLI + main loop
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-19 23:47:41 -04:00
albertfj114
dde083c32e feat: add buscarmisas-network importer — DB helpers and church processing
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-19 23:45:34 -04:00
8 changed files with 2403 additions and 8 deletions

315
docker-compose.yml Normal file
View File

@@ -0,0 +1,315 @@
# Shared json-file log rotation for the on-demand scraper services.
x-scraper-logging: &scraper-logging
  driver: json-file
  options:
    max-size: "50m"
    max-file: "3"

# Shared memory cap for the on-demand scraper services (merged in via `<<`).
x-scraper-limits: &scraper-limits
  deploy:
    resources:
      limits:
        memory: 4G

# Build, env and logging settings shared by every on-demand scraper service.
# Each scraper merges this fragment and only adds its own command and profile.
x-scraper-service: &scraper-service
  build:
    context: .
    dockerfile: Dockerfile.scraper
  env_file:
    - .env
  environment:
    # The password follows the same variable (and default) the db service is
    # initialised with, so overriding POSTGRES_PASSWORD keeps clients in sync.
    # NOTE: the value is placed in a URL unescaped; a password containing
    # URL-reserved characters (@ : / ? #) must be percent-encoded.
    - DATABASE_URL=postgresql://postgres:${POSTGRES_PASSWORD:-postgres}@db:5432/nearestmass
    - CHROMADB_URL=${CHROMADB_URL}
  logging: *scraper-logging

services:
  db:
    image: postgres:15-alpine
    ports:
      - "5434:5432"
    environment:
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-postgres}
      - POSTGRES_DB=nearestmass
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 5
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 4G
    shm_size: 256m
    logging:
      driver: json-file
      options:
        max-size: "50m"
        max-file: "3"

  app:
    build: .
    ports:
      - "3001:3001"
    environment:
      - DATABASE_URL=postgresql://postgres:${POSTGRES_PASSWORD:-postgres}@db:5432/nearestmass
      - ADMIN_API_KEY=${ADMIN_API_KEY}
    depends_on:
      db:
        condition: service_healthy
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 1G
    logging:
      driver: json-file
      options:
        max-size: "20m"
        max-file: "3"

  # Ad-hoc scraper container (no default command; enabled with the `tools` profile)
  scraper:
    <<: [*scraper-service, *scraper-limits]
    profiles:
      - tools

  # English scraper (on-demand via scheduler or API)
  scraper-english:
    <<: [*scraper-service, *scraper-limits]
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "english", "--max-failures", "10"]
    profiles:
      - scraper-english

  # Generic scraper (for languages without dedicated scrapers)
  scraper-generic:
    <<: [*scraper-service, *scraper-limits]
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "generic", "--max-failures", "10"]
    profiles:
      - scraper-generic

  # French scraper (on-demand via scheduler or API)
  scraper-french:
    <<: [*scraper-service, *scraper-limits]
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "french", "--max-failures", "10"]
    profiles:
      - scraper-french

  # German scraper (on-demand via scheduler or API)
  scraper-german:
    <<: [*scraper-service, *scraper-limits]
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "german", "--max-failures", "10"]
    profiles:
      - scraper-german

  # Italian scraper (on-demand via scheduler or API)
  scraper-italian:
    <<: [*scraper-service, *scraper-limits]
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "italian", "--max-failures", "10"]
    profiles:
      - scraper-italian

  # Spanish scraper (on-demand via scheduler or API)
  scraper-spanish:
    <<: [*scraper-service, *scraper-limits]
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "spanish", "--max-failures", "10"]
    profiles:
      - scraper-spanish

  # Polish scraper (on-demand via scheduler or API)
  scraper-polish:
    <<: [*scraper-service, *scraper-limits]
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "polish", "--max-failures", "10"]
    profiles:
      - scraper-polish

  # Portuguese scraper (on-demand via scheduler or API)
  scraper-portuguese:
    <<: [*scraper-service, *scraper-limits]
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "portuguese", "--max-failures", "10"]
    profiles:
      - scraper-portuguese

  # Dutch scraper (on-demand via scheduler or API)
  scraper-dutch:
    <<: [*scraper-service, *scraper-limits]
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "dutch", "--max-failures", "10"]
    profiles:
      - scraper-dutch

  # Czech scraper (on-demand via scheduler or API)
  scraper-czech:
    <<: [*scraper-service, *scraper-limits]
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "czech", "--max-failures", "10"]
    profiles:
      - scraper-czech

  # Hungarian scraper (on-demand via scheduler or API)
  scraper-hungarian:
    <<: [*scraper-service, *scraper-limits]
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "hungarian", "--max-failures", "10"]
    profiles:
      - scraper-hungarian

  # Long-running pipeline scheduler
  scheduler:
    build:
      context: .
      dockerfile: Dockerfile.scraper
    init: true  # tini as PID 1 — reaps zombie Chromium processes
    env_file:
      - .env
    environment:
      - DATABASE_URL=postgresql://postgres:${POSTGRES_PASSWORD:-postgres}@db:5432/nearestmass
      - CHROMADB_URL=${CHROMADB_URL}
      - BAIDU_MAPS_API_KEY=${BAIDU_MAPS_API_KEY}
    command: ["npx", "tsx", "scripts/scheduler.ts"]
    volumes:
      - ./logs:/app/logs
    depends_on:
      db:
        condition: service_healthy
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 8G
    stop_grace_period: 30s
    healthcheck:
      # Healthy while the heartbeat file exists and was modified within the
      # last 120 minutes; `find` prints nothing otherwise and `grep -q .` fails.
      test: ["CMD-SHELL", "find /app/logs/scheduler.heartbeat -mmin -120 2>/dev/null | grep -q . || exit 1"]
      interval: 90s
      timeout: 10s
      retries: 3
      start_period: 90s
    logging:
      driver: json-file
      options:
        max-size: "100m"
        max-file: "5"

  # Continuous FreeSearch enrichment worker
  freesearch-enrichment:
    build:
      context: .
      dockerfile: Dockerfile.scraper
    env_file:
      - .env
    environment:
      - DATABASE_URL=postgresql://postgres:${POSTGRES_PASSWORD:-postgres}@db:5432/nearestmass
      - FREESEARCH_URL=${FREESEARCH_URL}
      - CHROMADB_URL=${CHROMADB_URL}
    command: ["npx", "tsx", "scripts/enrich-with-freesearch.ts", "--continuous"]
    volumes:
      - ./logs:/app/logs
    depends_on:
      db:
        condition: service_healthy
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 4G
    logging:
      driver: json-file
      options:
        max-size: "50m"
        max-file: "3"

volumes:
  postgres_data:

View File

@@ -0,0 +1,309 @@
# FreeSearch Stability & Scheduler Healthcheck Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Make the `freesearch-enrichment` container stay alive when FreeSearch is down, clean up stale running jobs on restart, and fix the scheduler's perpetually-failing Docker healthcheck.
**Architecture:** Three targeted edits across two scripts and docker-compose. `enrich-with-freesearch.ts` gets a `waitForFreeSearch()` startup loop and a stale-job cleanup before job creation. `scheduler.ts` writes a heartbeat file on each hourly cron tick. `docker-compose.yml` swaps the `pgrep` healthcheck for a file-age check on that heartbeat file.
**Tech Stack:** TypeScript/tsx, Prisma, Docker Compose, node-cron, bash (healthcheck command)
---
## Files
- Modify: `scripts/enrich-with-freesearch.ts:872-880` — add `waitForFreeSearch()` function
- Modify: `scripts/enrich-with-freesearch.ts:1272-1296` — replace startup exit with wait call + stale job cleanup
- Modify: `scripts/scheduler.ts:747-758` — write heartbeat file in hourly cron
- Modify: `docker-compose.yml:275-280` — replace scheduler healthcheck
---
### Task 1: Add `waitForFreeSearch()` to the enrichment script
**Files:**
- Modify: `scripts/enrich-with-freesearch.ts`
The existing `healthCheck()` function (line 872) returns a boolean. We add `waitForFreeSearch()` directly below it — a loop that calls `healthCheck()` and sleeps with exponential backoff until it succeeds.
- [ ] **Step 1: Add `waitForFreeSearch()` after `healthCheck()`**
In `scripts/enrich-with-freesearch.ts`, find this block (around line 872):
```typescript
async function healthCheck(): Promise<boolean> {
try {
const resp = await axios.get(`${FREESEARCH_URL}/api/health`, { timeout: 5000 });
return resp.status === 200;
} catch {
return false;
}
}
```
Add the following function immediately after it:
```typescript
async function waitForFreeSearch(): Promise<void> {
let backoffMs = 30_000;
const maxBackoffMs = 300_000; // 5 minutes
let attempt = 0;
while (!shuttingDown) {
attempt++;
const healthy = await healthCheck();
if (healthy) {
if (attempt > 1) log('FreeSearch is back. Continuing...');
return;
}
const waitSec = Math.round(backoffMs / 1000);
logError(`FreeSearch not reachable at ${FREESEARCH_URL} (attempt ${attempt}). Retrying in ${waitSec}s...`);
await sleep(backoffMs);
backoffMs = Math.min(backoffMs * 2, maxBackoffMs);
}
}
```
- [ ] **Step 2: Replace the startup health check block in `main()`**
Find this block in `main()` (around line 1272):
```typescript
// Health check
log('Checking FreeSearch health...');
const healthy = await healthCheck();
if (!healthy) {
logError(`FreeSearch not reachable at ${FREESEARCH_URL}`);
logError('Make sure FreeSearch is running and accessible.');
process.exit(1);
}
log('FreeSearch health check: OK');
```
Replace with:
```typescript
// Wait for FreeSearch to be reachable (indefinite retry with backoff)
log('Waiting for FreeSearch to be reachable...');
await waitForFreeSearch();
if (shuttingDown) return;
log('FreeSearch health check: OK');
```
- [ ] **Step 3: Add stale job cleanup before job creation**
Find this block in `main()` (around line 1291):
```typescript
// Job tracking
let jobId = await createOrResumeJob(args);
if (!jobId) {
jobId = await createNewJob({ countryCode, limit, continuous, dryRun, reSearch });
}
log(`Job ID: ${jobId}`);
```
Replace with:
```typescript
// Job tracking — clean up any running jobs left by a previous container restart
await prisma.backgroundJob.updateMany({
where: { type: 'freesearch-enrichment', status: 'running' },
data: { status: 'failed', error: 'Container restarted', completedAt: new Date() },
});
let jobId = await createOrResumeJob(args);
if (!jobId) {
jobId = await createNewJob({ countryCode, limit, continuous, dryRun, reSearch });
}
log(`Job ID: ${jobId}`);
```
- [ ] **Step 4: Verify the script compiles**
```bash
cd /home/albert/Documents/ScraperControl
npx tsc --noEmit
```
Expected: no errors (or only pre-existing errors unrelated to this change).
- [ ] **Step 5: Commit**
```bash
git add scripts/enrich-with-freesearch.ts
git commit -m "fix: wait for FreeSearch on startup instead of exiting; clean stale jobs"
```
---
### Task 2: Write heartbeat file in scheduler
**Files:**
- Modify: `scripts/scheduler.ts`
The scheduler already has an hourly cron that logs a heartbeat message (lines 747-758). We add a single `fs.writeFileSync` call inside it to write the timestamp to `/app/logs/scheduler.heartbeat`. The `logs/` directory is already created by `ensureLogsDir()` at startup.
- [ ] **Step 1: Add heartbeat file write inside the hourly cron**
Find this block in `scripts/scheduler.ts` (around line 747):
```typescript
// Heartbeat every hour — logs cycle state
cron.schedule('0 * * * *', () => {
const currentGroup = cycleState.currentGroupIndex < PIPELINE_GROUPS.length
? PIPELINE_GROUPS[cycleState.currentGroupIndex].name
: 'none';
const jobs = runningJobs.size > 0
? `Running: ${[...runningJobs.keys()].join(', ')}`
: 'No jobs running';
const state = cycleState.waitingForCooldown
? 'cooldown'
: `group ${cycleState.currentGroupIndex + 1}/${PIPELINE_GROUPS.length} (${currentGroup})`;
log(`Heartbeat: Cycle ${cycleState.cycleNumber + 1}, ${state}. ${jobs}`);
}, { timezone: 'UTC' });
log('Registered cron job: heartbeat (hourly)');
```
Replace with:
```typescript
// Heartbeat every hour — logs cycle state and writes heartbeat file for Docker healthcheck
cron.schedule('0 * * * *', () => {
const currentGroup = cycleState.currentGroupIndex < PIPELINE_GROUPS.length
? PIPELINE_GROUPS[cycleState.currentGroupIndex].name
: 'none';
const jobs = runningJobs.size > 0
? `Running: ${[...runningJobs.keys()].join(', ')}`
: 'No jobs running';
const state = cycleState.waitingForCooldown
? 'cooldown'
: `group ${cycleState.currentGroupIndex + 1}/${PIPELINE_GROUPS.length} (${currentGroup})`;
log(`Heartbeat: Cycle ${cycleState.cycleNumber + 1}, ${state}. ${jobs}`);
fs.writeFileSync(path.join(LOGS_DIR, 'scheduler.heartbeat'), new Date().toISOString());
}, { timezone: 'UTC' });
log('Registered cron job: heartbeat (hourly)');
```
`fs` and `path` are already imported in `scheduler.ts`. `LOGS_DIR` is already defined as `'/app/logs'`.
- [ ] **Step 2: Verify the script compiles**
```bash
cd /home/albert/Documents/ScraperControl
npx tsc --noEmit
```
Expected: no errors.
- [ ] **Step 3: Commit**
```bash
git add scripts/scheduler.ts
git commit -m "fix: write heartbeat file for Docker healthcheck"
```
---
### Task 3: Fix scheduler healthcheck in docker-compose.yml
**Files:**
- Modify: `docker-compose.yml`
- [ ] **Step 1: Replace the scheduler healthcheck**
Find this block in `docker-compose.yml` (around line 275):
```yaml
healthcheck:
test: ["CMD-SHELL", "pgrep -f scheduler.ts || exit 1"]
interval: 60s
timeout: 10s
retries: 3
start_period: 30s
```
Replace with:
```yaml
healthcheck:
test: ["CMD-SHELL", "find /app/logs/scheduler.heartbeat -mmin -120 2>/dev/null | grep -q . || exit 1"]
interval: 90s
timeout: 10s
retries: 3
start_period: 90s
```
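The `find ... -mmin -120` check passes if the file exists and was modified within the last 120 minutes (2 hours). Note that `start_period: 90s` only suppresses failed checks during the first 90 seconds after the container starts; it does not cover the wait for the first hourly cron tick, which can be up to 60 minutes away. On a fresh deployment with no existing heartbeat file the scheduler can therefore report `unhealthy` until that first tick, unless the scheduler also writes the heartbeat file once at startup.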
- [ ] **Step 2: Commit**
```bash
git add docker-compose.yml
git commit -m "fix: replace pgrep healthcheck with heartbeat file check"
```
---
### Task 4: Deploy and verify
- [ ] **Step 1: Sync dev directory to Docker deployment**
```bash
cd /home/albert/Documents/ScraperControl
bash scripts/deploy-local.sh
```
Expected: rsync output showing the three changed files transferred to `/opt/docker/scraper-control/`.
- [ ] **Step 2: Restart the two affected containers**
```bash
docker compose -f /opt/docker/scraper-control/docker-compose.yml restart freesearch-enrichment scheduler
```
- [ ] **Step 3: Verify freesearch-enrichment is stable**
```bash
docker logs scraper-control-freesearch-enrichment-1 --tail 30 -f
```
Expected: logs showing "Waiting for FreeSearch to be reachable..." with retry messages if FreeSearch is still down, OR "FreeSearch health check: OK" and normal enrichment if FreeSearch is up. Container should NOT exit. Wait 2 minutes to confirm no restart.
- [ ] **Step 4: Confirm stale jobs were cleaned up**
```bash
docker exec scraper-control-db-1 psql -U postgres -d nearestmass \
-c "SELECT type, status, started_at, completed_at, error FROM background_jobs WHERE type = 'freesearch-enrichment' ORDER BY started_at DESC LIMIT 5;"
```
Expected: the two previously-stuck `running` jobs from Mar 22 and Mar 26 now show `status = 'failed'` with `error = 'Container restarted'`.
- [ ] **Step 5: Verify scheduler heartbeat file is written**
Check if the file already exists from before (it won't — it's new). Wait for next hourly cron tick, or check after 60 minutes:
```bash
docker exec scraper-control-scheduler-1 cat /app/logs/scheduler.heartbeat
```
Expected: an ISO timestamp, e.g. `2026-03-28T14:00:00.000Z`
- [ ] **Step 6: Verify scheduler becomes healthy**
```bash
docker ps --format "table {{.Names}}\t{{.Status}}" | grep scheduler
```
Expected: `scraper-control-scheduler-1 Up X hours (healthy)` — but only after the first heartbeat fires AND Docker's `start_period` (90s) passes. If the next cron tick hasn't happened yet, `status` will remain `starting` or `unhealthy` until it does.
To force an immediate test without waiting for the cron:
```bash
docker exec scraper-control-scheduler-1 bash -c \
"date -u +%Y-%m-%dT%H:%M:%S.000Z > /app/logs/scheduler.heartbeat && echo 'written'"
docker exec scraper-control-scheduler-1 \
find /app/logs/scheduler.heartbeat -mmin -120 2>/dev/null | grep -q . && echo "PASS" || echo "FAIL"
```
Expected: `written` then `PASS`.

View File

@@ -0,0 +1,103 @@
# FreeSearch Stability & Scheduler Healthcheck Fix
**Date:** 2026-03-28
**Status:** Approved
**Scope:** `scripts/enrich-with-freesearch.ts`, `scripts/scheduler.ts`, `docker-compose.yml`
---
## Problem Summary
Three related infrastructure reliability issues identified during health check:
1. **FreeSearch crash loop** — the `freesearch-enrichment` container restarts every ~60s because the startup health check calls `process.exit(1)` when the FreeSearch API is unreachable. The circuit breaker (which handles mid-run outages) lives inside `runContinuous()` and is never reached.
2. **Stale running jobs** — Each container restart creates a new `freesearch-enrichment` DB job without cleaning up the previous `running` one. Two jobs from Mar 22 and Mar 26 are permanently stuck as `running`.
3. **Scheduler healthcheck failing** — `node:20-bookworm-slim` does not include `procps`/`pgrep`. The healthcheck command `pgrep -f scheduler.ts` exits 1 silently → scheduler shows as `unhealthy` despite working correctly.
---
## Fix 1: FreeSearch Startup Resilience
### Change
Replace the `process.exit(1)` startup health check in `main()` with a `waitForFreeSearch()` function.
### Behavior
- Polls `GET /api/health` with exponential backoff: 30s → 60s → 120s → 240s → cap at 300s (5 min)
- Waits indefinitely — container stays alive until FreeSearch comes back
- Logs each attempt: `"FreeSearch not reachable, retrying in 120s..."`
- Logs recovery: `"FreeSearch is back, continuing..."`
- Proceeds to job setup and `runContinuous()` once health check passes
### Stale job cleanup (same function)
Before creating a new DB job in `main()`, run a cleanup:
```typescript
await prisma.backgroundJob.updateMany({
where: { type: 'freesearch-enrichment', status: 'running' },
data: { status: 'failed', error: 'Container restarted', completedAt: new Date() },
});
```
This fixes the two existing stuck jobs and prevents the pattern from recurring on future restarts.
### Files changed
- `scripts/enrich-with-freesearch.ts`: ~25 lines
---
## Fix 2: Scheduler Healthcheck
### Change
Replace `pgrep`-based healthcheck with a heartbeat file approach.
**In `scheduler.ts`:** Add `writeHeartbeat()` call inside the existing hourly cron handler. Writes current ISO timestamp to `/app/logs/scheduler.heartbeat`.
**In `docker-compose.yml`:** Replace healthcheck:
```yaml
# Before
test: ["CMD-SHELL", "pgrep -f scheduler.ts || exit 1"]
interval: 60s
timeout: 10s
retries: 3
start_period: 30s
# After
test: ["CMD-SHELL", "find /app/logs/scheduler.heartbeat -mmin -120 2>/dev/null | grep -q . || exit 1"]
interval: 90s
timeout: 10s
retries: 3
start_period: 90s
```
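The `./logs` volume is already mounted. `start_period: 90s` only suppresses failed checks during the first 90 seconds after container start; because the heartbeat is first written at the next hourly cron tick, a fresh deployment with no existing heartbeat file may still report `unhealthy` until that tick fires.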
### Files changed
- `scripts/scheduler.ts`: ~5 lines
- `docker-compose.yml`: 4 lines
---
## Fix 3: Deploy
```bash
bash scripts/deploy-local.sh
docker compose -f /opt/docker/scraper-control/docker-compose.yml restart freesearch-enrichment scheduler
```
---
## Success Criteria
- `freesearch-enrichment` container stays running even when FreeSearch is down, resumes enrichment when it comes back
- No new stale `running` freesearch-enrichment jobs after container restarts
- `scheduler` container shows as `healthy` in `docker ps`
- No behavioral changes to enrichment logic itself

View File

@@ -22,6 +22,7 @@
"scrape:diocese": "tsx scripts/scrape-diocese-directory.ts", "scrape:diocese": "tsx scripts/scrape-diocese-directory.ts",
"setup:diocese": "tsx scripts/setup-diocese.ts", "setup:diocese": "tsx scripts/setup-diocese.ts",
"import:gcatholic": "tsx scripts/import-gcatholic.ts", "import:gcatholic": "tsx scripts/import-gcatholic.ts",
"import:buscarmisas-network": "tsx scripts/import-buscarmisas-network.ts",
"import:orarimesse": "tsx scripts/import-orarimesse.ts", "import:orarimesse": "tsx scripts/import-orarimesse.ts",
"import:mass-schedules-ph": "tsx scripts/import-mass-schedules-ph.ts", "import:mass-schedules-ph": "tsx scripts/import-mass-schedules-ph.ts",
"import:philmass": "tsx scripts/import-philmass.ts", "import:philmass": "tsx scripts/import-philmass.ts",
@@ -29,7 +30,7 @@
"import:msze-info": "tsx scripts/import-msze-info.ts", "import:msze-info": "tsx scripts/import-msze-info.ts",
"import:weekdaymasses": "tsx scripts/import-weekdaymasses.ts", "import:weekdaymasses": "tsx scripts/import-weekdaymasses.ts",
"import:masstimes-api": "tsx scripts/import-masstimes-api.ts", "import:masstimes-api": "tsx scripts/import-masstimes-api.ts",
"import:discovermass": "tsx scripts/import-discovermass.ts", "dedup:geo": "tsx scripts/find-geo-duplicates.ts",
"postinstall": "prisma generate" "postinstall": "prisma generate"
}, },
"dependencies": { "dependencies": {

25
scripts/deploy-local.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/bin/bash
# Sync the development checkout into the Docker deployment directory, then
# rebuild and restart the long-running services.
#
# DEV_PATH and DOCKER_PATH can be overridden from the environment; the
# defaults are the paths this script has always used.
#
# -e: stop on the first failing command
# -u: treat unset variables as errors (guards the rsync source/target paths)
# -o pipefail: a failure anywhere in a pipeline fails the pipeline
set -euo pipefail

DEV_PATH="${DEV_PATH:-$HOME/Documents/ScraperControl}"
DOCKER_PATH="${DOCKER_PATH:-/opt/docker/scraper-control}"

# Services that are rebuilt and restarted on every deploy.
SERVICES=(app scheduler freesearch-enrichment)

# Fail early with a clear message instead of an rsync error mid-deploy.
if [[ ! -d "$DEV_PATH" ]]; then
  echo "Source directory not found: $DEV_PATH" >&2
  exit 1
fi

echo "Syncing dev → Docker deployment..."
# Trailing slashes copy the directory contents rather than the directory itself.
# Environment files and local tooling state are never copied to the deployment.
rsync -avz \
  --exclude node_modules \
  --exclude .next \
  --exclude '.env*' \
  --exclude .git \
  --exclude .claude \
  --exclude .playwright-mcp \
  "$DEV_PATH/" "$DOCKER_PATH/"

echo "Restarting Docker services..."
cd "$DOCKER_PATH"
docker compose build "${SERVICES[@]}"
docker compose up -d "${SERVICES[@]}"
docker compose ps
docker compose logs --tail 5 scheduler
echo "Deploy complete!"

File diff suppressed because it is too large Load Diff

View File

@@ -30,6 +30,12 @@ import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher'; import type { ExistingChurch } from '../src/lib/church-matcher';
import { getDayNamesForCountry, buildDayPatterns } from '../src/scrapers/i18n/day-names'; import { getDayNamesForCountry, buildDayPatterns } from '../src/scrapers/i18n/day-names';
// Connection string comes from DATABASE_URL, falling back to a local Postgres instance.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the first ":<text>@" segment (the password) replaced by ":***@".
console.log(`Connecting to: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
// SSL (with certificate verification disabled) is enabled only when the URL contains "neon".
// NOTE(review): presumably this targets a Neon-hosted Postgres — confirm.
const pool = new Pool({ connectionString: dbUrl, ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined });
// Prisma client that talks to the database through the pg pool via the PrismaPg adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// ─── Site Config ───────────────────────────────────────────────────────────── // ─── Site Config ─────────────────────────────────────────────────────────────
interface SiteConfig { interface SiteConfig {
@@ -218,6 +224,137 @@ function sleep(ms: number): Promise<void> {
return new Promise(resolve => setTimeout(resolve, ms)); return new Promise(resolve => setTimeout(resolve, ms));
} }
// ─── DB Helpers ───────────────────────────────────────────────────────────────
/**
 * Load every church stored for `country`, restricted to the columns listed in
 * the `select` below (identity, coordinates, per-source external IDs, contact
 * fields).
 *
 * @param country Value matched against the `country` column.
 * @returns The selected rows, cast to `ExistingChurch[]`.
 */
async function loadExistingChurches(country: string): Promise<ExistingChurch[]> {
  console.log(`Loading existing ${country} churches from DB...`);

  const rows = await prisma.church.findMany({
    where: { country },
    select: {
      // Identity and position
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // External IDs, one per import source
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      buscarmisasNetworkId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
      country: true,
    },
  });

  console.log(` Loaded ${rows.length} existing ${country} churches`);
  return rows as ExistingChurch[];
}
// ─── Church Processing ────────────────────────────────────────────────────────
/**
 * Fetch a single church page, parse it, and create or update the church and
 * its mass schedule.
 *
 * Flow:
 *  1. Fetch and parse the page; pages without a parse result are counted as skipped.
 *  2. In dry-run mode only log what would be imported.
 *  3. If `findDuplicateChurch` matches an existing church, attach this network's
 *     external ID, fill in a missing phone / zero coordinates, and replace the
 *     stored schedule when the page yielded any masses.
 *  4. Otherwise create the church together with its schedule.
 *
 * Both write paths run inside a single transaction, so a failure while writing
 * schedules never leaves a church tagged with `buscarmisasNetworkId` but without
 * its schedule (which later runs would skip as "already imported").
 *
 * Errors are caught, counted in `stats.errors`, and logged; they never propagate.
 *
 * @param url              Church page URL to import.
 * @param domain           Network domain the URL belongs to.
 * @param config           Site configuration for that domain.
 * @param existingChurches In-memory duplicate-detection cache; mutated on success.
 * @param args             Parsed CLI arguments (`dryRun` is read here).
 * @param stats            Running counters; mutated in place.
 */
async function processChurch(
  url: string,
  domain: string,
  config: SiteConfig,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  stats: ImportStats,
): Promise<void> {
  stats.total++;
  try {
    const html = await fetchWithRetry(url);
    const parsed = parseChurchPage(html, domain, url, config);
    if (!parsed) {
      console.log(` [skip] No name/coords: ${url}`);
      stats.skipped++;
      return;
    }

    const masses = parseMassSchedule(html, config.country);
    if (args.dryRun) {
      console.log(` [dry-run] ${parsed.name} — ${masses.length} masses`);
      return;
    }

    // Sites configured as 'pt' are labelled Portuguese, everything else Spanish.
    const massLanguage = config.language === 'pt' ? 'Portuguese' : 'Spanish';
    // Build the schedule rows for whichever church ID ends up owning them.
    const scheduleRowsFor = <Id>(churchId: Id) =>
      masses.map(m => ({
        churchId,
        dayOfWeek: m.dayOfWeek,
        time: m.time,
        language: massLanguage,
        notes: null,
      }));

    const candidate = {
      name: parsed.name,
      lat: parsed.lat,
      lng: parsed.lng,
      buscarmisasNetworkId: parsed.externalId,
    };
    const duplicate = findDuplicateChurch(candidate, existingChurches);

    if (duplicate) {
      const updateData: Record<string, unknown> = {
        buscarmisasNetworkId: parsed.externalId,
        lastScrapedAt: new Date(),
      };
      // Only fill gaps; never overwrite data that is already present.
      if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
      if (parsed.lat !== 0 && duplicate.latitude === 0) {
        updateData.latitude = parsed.lat;
        updateData.longitude = parsed.lng;
      }

      await prisma.$transaction(async (tx) => {
        await tx.church.update({ where: { id: duplicate.id }, data: updateData });
        // Replace the schedule only when the page actually provided one.
        if (masses.length > 0) {
          await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
          await tx.massSchedule.createMany({ data: scheduleRowsFor(duplicate.id) });
        }
      });

      // Keep the cache in sync so later URLs match on the external ID.
      duplicate.buscarmisasNetworkId = parsed.externalId;
      stats.updated++;
    } else {
      // Create the church and its schedule atomically.
      const church = await prisma.$transaction(async (tx) => {
        const created = await tx.church.create({
          data: {
            name: parsed.name,
            address: parsed.address,
            city: parsed.city,
            state: parsed.state,
            country: parsed.country,
            phone: parsed.phone,
            latitude: parsed.lat,
            longitude: parsed.lng,
            buscarmisasNetworkId: parsed.externalId,
            source: 'buscarmisas-network',
            hasWebsite: false,
            // Marked as scraped only when a schedule was actually imported.
            ...(masses.length > 0 ? { lastScrapedAt: new Date() } : {}),
          },
        });
        if (masses.length > 0) {
          await tx.massSchedule.createMany({ data: scheduleRowsFor(created.id) });
        }
        return created;
      });

      // Register the new church in the cache only after the write succeeded.
      existingChurches.push({
        id: church.id, name: parsed.name, latitude: parsed.lat, longitude: parsed.lng,
        osmId: null, baiduId: null, masstimesId: null, orarimesseId: null,
        massSchedulesPhId: null, philmassId: null, horariosMisasId: null,
        mszeInfoId: null, weekdayMassesId: null, messesInfoId: null,
        bohosluzbyId: null, miserendId: null, kerknetId: null,
        gottesdienstzeitenId: null, discovermassId: null,
        buscarmisasNetworkId: parsed.externalId,
        source: 'buscarmisas-network', website: null, phone: parsed.phone,
        address: parsed.address, country: parsed.country,
      });
      stats.created++;
    }

    stats.massSchedulesCreated += masses.length;
    console.log(
      ` [${duplicate ? 'update' : 'create'}] ${parsed.name} — ${masses.length} masses — ` +
      `${stats.total} total (${stats.created} created, ${stats.updated} updated, ${stats.errors}✗)`
    );
  } catch (err) {
    stats.errors++;
    console.error(` [error] ${url}: ${err instanceof Error ? err.message : err}`);
  }
}
// ─── Sitemap Discovery ──────────────────────────────────────────────────────── // ─── Sitemap Discovery ────────────────────────────────────────────────────────
/** /**
@@ -257,3 +394,141 @@ export async function getChurchUrls(domain: string, config: SiteConfig): Promise
console.log(` Total church URLs: ${unique.length}`); console.log(` Total church URLs: ${unique.length}`);
return unique; return unique;
} }
// ─── CLI ──────────────────────────────────────────────────────────────────────
/**
 * Parse the command-line flags (everything after the script path) into a
 * `CLIArgs` record.
 *
 * Recognised flags: `--domain <value>`, `--all`, `--dry-run`,
 * `--resume-from <n>`, `--limit <n>`, `--job-id <value>`. Flags that take a
 * value consume the following token; unrecognised tokens are ignored.
 * Numeric values are parsed with base-10 `parseInt` and are not validated here.
 */
function parseCLIArgs(): CLIArgs {
  const parsedArgs: CLIArgs = {
    domain: null,
    all: false,
    dryRun: false,
    resumeFrom: 0,
    limit: null,
    jobId: null,
  };

  const tokens = process.argv.slice(2);
  let cursor = 0;
  while (cursor < tokens.length) {
    const flag = tokens[cursor];
    if (flag === '--domain') {
      parsedArgs.domain = tokens[++cursor];
    } else if (flag === '--all') {
      parsedArgs.all = true;
    } else if (flag === '--dry-run') {
      parsedArgs.dryRun = true;
    } else if (flag === '--resume-from') {
      parsedArgs.resumeFrom = parseInt(tokens[++cursor], 10);
    } else if (flag === '--limit') {
      parsedArgs.limit = parseInt(tokens[++cursor], 10);
    } else if (flag === '--job-id') {
      parsedArgs.jobId = tokens[++cursor];
    }
    cursor++;
  }

  return parsedArgs;
}
/**
 * Validate parsed CLI arguments, printing an error to stderr and exiting with
 * status 1 on the first problem found.
 *
 * Rules:
 *  - one of `--domain` / `--all` is required;
 *  - `--domain` must be a key of `NETWORK_SITES`;
 *  - `--resume-from` must be a non-negative integer;
 *  - `--limit`, when given, must be a non-negative integer;
 *  - `--resume-from` cannot be combined with `--all`.
 *
 * The numeric checks matter because the values are used as `slice` bounds:
 * NaN (a missing or non-numeric value) and negative numbers would silently
 * change which URLs get processed rather than fail.
 */
function validateArgs(args: CLIArgs): void {
  if (!args.domain && !args.all) {
    console.error('Usage:');
    console.error(' npx tsx scripts/import-buscarmisas-network.ts --domain <domain>');
    console.error(' npx tsx scripts/import-buscarmisas-network.ts --all');
    console.error('\nValid domains:', Object.keys(NETWORK_SITES).join(', '));
    process.exit(1);
  }
  if (args.domain && !NETWORK_SITES[args.domain]) {
    console.error(`Unknown domain: ${args.domain}`);
    console.error('Valid domains:', Object.keys(NETWORK_SITES).join(', '));
    process.exit(1);
  }
  if (!Number.isInteger(args.resumeFrom) || args.resumeFrom < 0) {
    console.error('--resume-from requires a non-negative integer.');
    process.exit(1);
  }
  if (args.limit !== null && (!Number.isInteger(args.limit) || args.limit < 0)) {
    console.error('--limit requires a non-negative integer.');
    process.exit(1);
  }
  if (args.all && args.resumeFrom > 0) {
    console.error('--resume-from cannot be used with --all. Use --domain to resume a specific site.');
    process.exit(1);
  }
}
/**
 * Import every not-yet-imported church page of one network domain.
 *
 * Discovers the domain's church URLs, drops the first `args.resumeFrom`
 * entries, filters out URLs whose external ID is already stored on a church of
 * the same country, applies `args.limit` if set, and processes the remainder
 * sequentially with a pause of `REQUEST_DELAY_MS` between requests.
 *
 * @returns Counters for this domain only.
 */
async function runDomain(domain: string, config: SiteConfig, args: CLIArgs): Promise<ImportStats> {
  const stats: ImportStats = {
    total: 0,
    created: 0,
    updated: 0,
    skipped: 0,
    errors: 0,
    massSchedulesCreated: 0,
  };

  const allUrls = await getChurchUrls(domain, config);
  const existingChurches = await loadExistingChurches(config.country);

  // External IDs already present in the DB, for a fast skip check.
  const importedIds = new Set(
    existingChurches
      .map(church => church.buscarmisasNetworkId)
      .filter((id): id is NonNullable<typeof id> => Boolean(id)),
  );

  const pendingUrls = allUrls
    .slice(args.resumeFrom)
    .filter(url => !importedIds.has(buildExternalId(domain, url)));
  const candidateUrls = args.limit === null ? pendingUrls : pendingUrls.slice(0, args.limit);

  console.log(`\n${domain}: ${allUrls.length} total | ${importedIds.size} already imported | ${candidateUrls.length} to process\n`);

  let position = 0;
  for (const url of candidateUrls) {
    position++;
    console.log(`[${position}/${candidateUrls.length}] ${url}`);
    await processChurch(url, domain, config, existingChurches, args, stats);
    // Pause between requests, but not after the final one.
    if (position < candidateUrls.length) await sleep(REQUEST_DELAY_MS);
  }

  return stats;
}
// ─── Main ─────────────────────────────────────────────────────────────────────
/**
 * CLI entry point.
 *
 * Parses and validates the arguments, optionally marks the background job
 * identified by `--job-id` as running, imports the selected domain(s) one after
 * another (waiting DOMAIN_DELAY_MS between domains), prints a summary, records
 * the final job state, and closes the database connections.
 *
 * The job is recorded as `failed` when the import aborted with an exception or
 * when more than 10% of the processed items errored; otherwise `completed`.
 * Any exception from the import loop is re-thrown to the caller after cleanup.
 */
async function main() {
  const args = parseCLIArgs();
  validateArgs(args);

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (err) {
      // Best effort: the job row may not exist yet, so the import carries on.
      console.warn(`Could not mark job ${args.jobId} as running:`, err);
    }
  }

  const domainsToRun: [string, SiteConfig][] = args.all
    ? Object.entries(NETWORK_SITES)
    : [[args.domain!, NETWORK_SITES[args.domain!]]];

  const totalStats: ImportStats = { total: 0, created: 0, updated: 0, skipped: 0, errors: 0, massSchedulesCreated: 0 };

  // Set when the loop below throws, so the finally block does not report a
  // crashed run as a successful one.
  let aborted = false;

  try {
    for (let d = 0; d < domainsToRun.length; d++) {
      const [domain, config] = domainsToRun[d];
      console.log(`\n${'─'.repeat(60)}`);
      console.log(`Domain ${d + 1}/${domainsToRun.length}: ${domain} (${config.country})`);
      console.log('─'.repeat(60));

      const stats = await runDomain(domain, config, args);
      totalStats.total += stats.total;
      totalStats.created += stats.created;
      totalStats.updated += stats.updated;
      totalStats.skipped += stats.skipped;
      totalStats.errors += stats.errors;
      totalStats.massSchedulesCreated += stats.massSchedulesCreated;

      // Pause between domains, but not after the last one.
      if (d < domainsToRun.length - 1) await sleep(DOMAIN_DELAY_MS);
    }
  } catch (err) {
    aborted = true;
    throw err; // still propagated to the caller once the finally block has run
  } finally {
    console.log(
      aborted
        ? '\n─── Import Aborted ────────────────────────────────────────'
        : '\n─── Import Complete ───────────────────────────────────────'
    );
    console.log(`Total processed: ${totalStats.total}`);
    console.log(`Created:         ${totalStats.created}`);
    console.log(`Updated:         ${totalStats.updated}`);
    console.log(`Skipped:         ${totalStats.skipped}`);
    console.log(`Errors:          ${totalStats.errors}`);
    console.log(`Mass schedules:  ${totalStats.massSchedulesCreated}`);

    if (args.jobId) {
      // An aborted run is always a failure, regardless of the error ratio.
      const status = aborted || totalStats.errors > totalStats.total * 0.1 ? 'failed' : 'completed';
      try {
        await prisma.backgroundJob.update({
          where: { id: args.jobId },
          data: {
            status,
            completedAt: new Date(),
            processed: totalStats.total,
            succeeded: totalStats.created + totalStats.updated,
            failed: totalStats.errors,
            itemsFound: totalStats.massSchedulesCreated,
          },
        });
      } catch (err) {
        // Best effort: failing to record the outcome must not mask the import result.
        console.warn(`Could not record final status for job ${args.jobId}:`, err);
      }
    }

    // Close the pool even if the Prisma disconnect fails.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
// Start the import; any rejection that escapes main() is reported and turned
// into a non-zero exit code.
main().catch((fatal) => {
  console.error('Fatal error:', fatal);
  process.exit(1);
});

View File

@@ -57,7 +57,6 @@ const PIPELINE_GROUPS: PipelineGroup[] = [
{ name: 'kerknet-import', type: 'kerknet-import', config: {} }, { name: 'kerknet-import', type: 'kerknet-import', config: {} },
{ name: 'gottesdienstzeiten-import', type: 'gottesdienstzeiten-import', config: {} }, { name: 'gottesdienstzeiten-import', type: 'gottesdienstzeiten-import', config: {} },
{ name: 'masstimes-api-import', type: 'masstimes-api-import', config: {} }, { name: 'masstimes-api-import', type: 'masstimes-api-import', config: {} },
{ name: 'discovermass-import', type: 'discovermass-import', config: {} },
], ],
}, },
{ {
@@ -237,11 +236,6 @@ function getJobCommand(type: string, language?: string | null, config?: Record<s
if (config?.region) args.splice(2, 1, '--region', String(config.region)); // replace --all with --region if (config?.region) args.splice(2, 1, '--region', String(config.region)); // replace --all with --region
return { command: 'npx', args }; return { command: 'npx', args };
} }
case 'discovermass-import': {
const args = ['tsx', 'scripts/import-discovermass.ts', '--all'];
if (config?.resumeFrom) args.push('--resume-from', String(config.resumeFrom));
return { command: 'npx', args };
}
default: default:
throw new Error(`Unknown job type: ${type}`); throw new Error(`Unknown job type: ${type}`);
} }
@@ -737,6 +731,7 @@ async function main(): Promise<void> {
validateEnvironment(); validateEnvironment();
ensureLogsDir(); ensureLogsDir();
fs.writeFileSync(path.join(LOGS_DIR, 'scheduler.heartbeat'), new Date().toISOString());
// Crash recovery: mark orphaned jobs as failed // Crash recovery: mark orphaned jobs as failed
await recoverFromCrash(); await recoverFromCrash();
@@ -749,7 +744,7 @@ async function main(): Promise<void> {
cron.schedule('0 */6 * * *', () => cleanStaleJobs(), { timezone: 'UTC' }); cron.schedule('0 */6 * * *', () => cleanStaleJobs(), { timezone: 'UTC' });
log('Registered cron job: stale-job-cleanup (every 6h)'); log('Registered cron job: stale-job-cleanup (every 6h)');
// Heartbeat every hour — logs cycle state // Heartbeat every hour — logs cycle state and writes heartbeat file for Docker healthcheck
cron.schedule('0 * * * *', () => { cron.schedule('0 * * * *', () => {
const currentGroup = cycleState.currentGroupIndex < PIPELINE_GROUPS.length const currentGroup = cycleState.currentGroupIndex < PIPELINE_GROUPS.length
? PIPELINE_GROUPS[cycleState.currentGroupIndex].name ? PIPELINE_GROUPS[cycleState.currentGroupIndex].name
@@ -761,6 +756,7 @@ async function main(): Promise<void> {
? 'cooldown' ? 'cooldown'
: `group ${cycleState.currentGroupIndex + 1}/${PIPELINE_GROUPS.length} (${currentGroup})`; : `group ${cycleState.currentGroupIndex + 1}/${PIPELINE_GROUPS.length} (${currentGroup})`;
log(`Heartbeat: Cycle ${cycleState.cycleNumber + 1}, ${state}. ${jobs}`); log(`Heartbeat: Cycle ${cycleState.cycleNumber + 1}, ${state}. ${jobs}`);
fs.writeFileSync(path.join(LOGS_DIR, 'scheduler.heartbeat'), new Date().toISOString());
}, { timezone: 'UTC' }); }, { timezone: 'UTC' });
log('Registered cron job: heartbeat (hourly)'); log('Registered cron job: heartbeat (hourly)');