Compare commits
10 Commits
5c7bc4cfed
...
3bd4d2e2f9
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3bd4d2e2f9 | ||
|
|
73d8e8990c | ||
|
|
3cb780a692 | ||
|
|
8f7c4d1698 | ||
|
|
857eaedbcf | ||
|
|
93d8a9080a | ||
|
|
da4aa61860 | ||
|
|
9593e08983 | ||
|
|
2b37c2d5f2 | ||
|
|
dde083c32e |
315
docker-compose.yml
Normal file
315
docker-compose.yml
Normal file
@@ -0,0 +1,315 @@
|
||||
# Shared json-file log rotation settings for the on-demand scraper services.
x-scraper-logging: &scraper-logging
  driver: json-file
  options:
    max-size: "50m"
    max-file: "3"

# Shared memory cap for the on-demand scraper services.
x-scraper-limits: &scraper-limits
  deploy:
    resources:
      limits:
        memory: 4G

# Everything the on-demand scraper services have in common. Each scraper below
# merges this mapping and only adds its own `command` and `profiles`.
x-scraper-service: &scraper-service
  build:
    context: .
    dockerfile: Dockerfile.scraper
  env_file:
    - .env
  environment:
    # The password must track the one the db service is initialised with;
    # both default to "postgres" when POSTGRES_PASSWORD is unset.
    # NOTE(review): a password containing URL-reserved characters must be
    # percent-encoded, and POSTGRES_PASSWORD only takes effect when the
    # postgres_data volume is first initialised.
    - DATABASE_URL=postgresql://postgres:${POSTGRES_PASSWORD:-postgres}@db:5432/nearestmass
    - CHROMADB_URL=${CHROMADB_URL}
  <<: *scraper-limits
  logging: *scraper-logging

services:
  db:
    image: postgres:15-alpine
    ports:
      - "5434:5432"
    environment:
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-postgres}
      - POSTGRES_DB=nearestmass
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 5
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 4G
    shm_size: 256m
    logging:
      driver: json-file
      options:
        max-size: "50m"
        max-file: "3"

  app:
    build: .
    ports:
      - "3001:3001"
    environment:
      # Same password source as the db service (see x-scraper-service).
      - DATABASE_URL=postgresql://postgres:${POSTGRES_PASSWORD:-postgres}@db:5432/nearestmass
      - ADMIN_API_KEY=${ADMIN_API_KEY}
    depends_on:
      db:
        condition: service_healthy
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 1G
    logging:
      driver: json-file
      options:
        max-size: "20m"
        max-file: "3"

  # Ad-hoc scraper container (no default command override); enabled with the
  # "tools" profile.
  scraper:
    <<: *scraper-service
    profiles:
      - tools

  # English scraper (on-demand via scheduler or API)
  scraper-english:
    <<: *scraper-service
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "english", "--max-failures", "10"]
    profiles:
      - scraper-english

  # Generic scraper (for languages without dedicated scrapers)
  scraper-generic:
    <<: *scraper-service
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "generic", "--max-failures", "10"]
    profiles:
      - scraper-generic

  # French scraper (on-demand via scheduler or API)
  scraper-french:
    <<: *scraper-service
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "french", "--max-failures", "10"]
    profiles:
      - scraper-french

  # German scraper (on-demand via scheduler or API)
  scraper-german:
    <<: *scraper-service
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "german", "--max-failures", "10"]
    profiles:
      - scraper-german

  # Italian scraper (on-demand via scheduler or API)
  scraper-italian:
    <<: *scraper-service
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "italian", "--max-failures", "10"]
    profiles:
      - scraper-italian

  # Spanish scraper (on-demand via scheduler or API)
  scraper-spanish:
    <<: *scraper-service
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "spanish", "--max-failures", "10"]
    profiles:
      - scraper-spanish

  # Polish scraper (on-demand via scheduler or API)
  scraper-polish:
    <<: *scraper-service
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "polish", "--max-failures", "10"]
    profiles:
      - scraper-polish

  # Portuguese scraper (on-demand via scheduler or API)
  scraper-portuguese:
    <<: *scraper-service
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "portuguese", "--max-failures", "10"]
    profiles:
      - scraper-portuguese

  # Dutch scraper (on-demand via scheduler or API)
  scraper-dutch:
    <<: *scraper-service
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "dutch", "--max-failures", "10"]
    profiles:
      - scraper-dutch

  # Czech scraper (on-demand via scheduler or API)
  scraper-czech:
    <<: *scraper-service
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "czech", "--max-failures", "10"]
    profiles:
      - scraper-czech

  # Hungarian scraper (on-demand via scheduler or API)
  scraper-hungarian:
    <<: *scraper-service
    command: ["npx", "tsx", "scripts/scrape-churches.ts", "--all", "--language", "hungarian", "--max-failures", "10"]
    profiles:
      - scraper-hungarian

  scheduler:
    build:
      context: .
      dockerfile: Dockerfile.scraper
    init: true  # tini as PID 1 — reaps zombie Chromium processes
    env_file:
      - .env
    environment:
      # Same password source as the db service (see x-scraper-service).
      - DATABASE_URL=postgresql://postgres:${POSTGRES_PASSWORD:-postgres}@db:5432/nearestmass
      - CHROMADB_URL=${CHROMADB_URL}
      - BAIDU_MAPS_API_KEY=${BAIDU_MAPS_API_KEY}
    command: ["npx", "tsx", "scripts/scheduler.ts"]
    volumes:
      - ./logs:/app/logs
    depends_on:
      db:
        condition: service_healthy
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 8G
    stop_grace_period: 30s
    healthcheck:
      # Healthy while /app/logs/scheduler.heartbeat exists and was modified
      # within the last 120 minutes; any other outcome exits 1.
      test: ["CMD-SHELL", "find /app/logs/scheduler.heartbeat -mmin -120 2>/dev/null | grep -q . || exit 1"]
      interval: 90s
      timeout: 10s
      retries: 3
      start_period: 90s
    logging:
      driver: json-file
      options:
        max-size: "100m"
        max-file: "5"

  freesearch-enrichment:
    build:
      context: .
      dockerfile: Dockerfile.scraper
    env_file:
      - .env
    environment:
      # Same password source as the db service (see x-scraper-service).
      - DATABASE_URL=postgresql://postgres:${POSTGRES_PASSWORD:-postgres}@db:5432/nearestmass
      - FREESEARCH_URL=${FREESEARCH_URL}
      - CHROMADB_URL=${CHROMADB_URL}
    command: ["npx", "tsx", "scripts/enrich-with-freesearch.ts", "--continuous"]
    volumes:
      - ./logs:/app/logs
    depends_on:
      db:
        condition: service_healthy
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 4G
    logging:
      driver: json-file
      options:
        max-size: "50m"
        max-file: "3"

volumes:
  postgres_data:
|
||||
309
docs/superpowers/plans/2026-03-28-freesearch-stability.md
Normal file
309
docs/superpowers/plans/2026-03-28-freesearch-stability.md
Normal file
@@ -0,0 +1,309 @@
|
||||
# FreeSearch Stability & Scheduler Healthcheck Implementation Plan
|
||||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||
|
||||
**Goal:** Make the `freesearch-enrichment` container stay alive when FreeSearch is down, clean up stale running jobs on restart, and fix the scheduler's perpetually-failing Docker healthcheck.
|
||||
|
||||
**Architecture:** Three targeted edits across two scripts and docker-compose. `enrich-with-freesearch.ts` gets a `waitForFreeSearch()` startup loop and a stale-job cleanup before job creation. `scheduler.ts` writes a heartbeat file on each hourly cron tick. `docker-compose.yml` swaps the `pgrep` healthcheck for a file-age check on that heartbeat file.
|
||||
|
||||
**Tech Stack:** TypeScript/tsx, Prisma, Docker Compose, node-cron, bash (healthcheck command)
|
||||
|
||||
---
|
||||
|
||||
## Files
|
||||
|
||||
- Modify: `scripts/enrich-with-freesearch.ts:872-880` — add `waitForFreeSearch()` function
|
||||
- Modify: `scripts/enrich-with-freesearch.ts:1272-1296` — replace startup exit with wait call + stale job cleanup
|
||||
- Modify: `scripts/scheduler.ts:747-758` — write heartbeat file in hourly cron
|
||||
- Modify: `docker-compose.yml:275-280` — replace scheduler healthcheck
|
||||
|
||||
---
|
||||
|
||||
### Task 1: Add `waitForFreeSearch()` to the enrichment script
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/enrich-with-freesearch.ts`
|
||||
|
||||
The existing `healthCheck()` function (line 872) returns a boolean. We add `waitForFreeSearch()` directly below it — a loop that calls `healthCheck()` and sleeps with exponential backoff until it succeeds.
|
||||
|
||||
- [ ] **Step 1: Add `waitForFreeSearch()` after `healthCheck()`**
|
||||
|
||||
In `scripts/enrich-with-freesearch.ts`, find this block (around line 872):
|
||||
|
||||
```typescript
|
||||
async function healthCheck(): Promise<boolean> {
|
||||
try {
|
||||
const resp = await axios.get(`${FREESEARCH_URL}/api/health`, { timeout: 5000 });
|
||||
return resp.status === 200;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Add the following function immediately after it:
|
||||
|
||||
```typescript
|
||||
async function waitForFreeSearch(): Promise<void> {
|
||||
let backoffMs = 30_000;
|
||||
const maxBackoffMs = 300_000; // 5 minutes
|
||||
let attempt = 0;
|
||||
|
||||
while (!shuttingDown) {
|
||||
attempt++;
|
||||
const healthy = await healthCheck();
|
||||
if (healthy) {
|
||||
if (attempt > 1) log('FreeSearch is back. Continuing...');
|
||||
return;
|
||||
}
|
||||
const waitSec = Math.round(backoffMs / 1000);
|
||||
logError(`FreeSearch not reachable at ${FREESEARCH_URL} (attempt ${attempt}). Retrying in ${waitSec}s...`);
|
||||
await sleep(backoffMs);
|
||||
backoffMs = Math.min(backoffMs * 2, maxBackoffMs);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Replace the startup health check block in `main()`**
|
||||
|
||||
Find this block in `main()` (around line 1272):
|
||||
|
||||
```typescript
|
||||
// Health check
|
||||
log('Checking FreeSearch health...');
|
||||
const healthy = await healthCheck();
|
||||
if (!healthy) {
|
||||
logError(`FreeSearch not reachable at ${FREESEARCH_URL}`);
|
||||
logError('Make sure FreeSearch is running and accessible.');
|
||||
process.exit(1);
|
||||
}
|
||||
log('FreeSearch health check: OK');
|
||||
```
|
||||
|
||||
Replace with:
|
||||
|
||||
```typescript
|
||||
// Wait for FreeSearch to be reachable (indefinite retry with backoff)
|
||||
log('Waiting for FreeSearch to be reachable...');
|
||||
await waitForFreeSearch();
|
||||
if (shuttingDown) return;
|
||||
log('FreeSearch health check: OK');
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Add stale job cleanup before job creation**
|
||||
|
||||
Find this block in `main()` (around line 1291):
|
||||
|
||||
```typescript
|
||||
// Job tracking
|
||||
let jobId = await createOrResumeJob(args);
|
||||
if (!jobId) {
|
||||
jobId = await createNewJob({ countryCode, limit, continuous, dryRun, reSearch });
|
||||
}
|
||||
log(`Job ID: ${jobId}`);
|
||||
```
|
||||
|
||||
Replace with:
|
||||
|
||||
```typescript
|
||||
// Job tracking — clean up any running jobs left by a previous container restart
|
||||
await prisma.backgroundJob.updateMany({
|
||||
where: { type: 'freesearch-enrichment', status: 'running' },
|
||||
data: { status: 'failed', error: 'Container restarted', completedAt: new Date() },
|
||||
});
|
||||
|
||||
let jobId = await createOrResumeJob(args);
|
||||
if (!jobId) {
|
||||
jobId = await createNewJob({ countryCode, limit, continuous, dryRun, reSearch });
|
||||
}
|
||||
log(`Job ID: ${jobId}`);
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Verify the script compiles**
|
||||
|
||||
```bash
|
||||
cd /home/albert/Documents/ScraperControl
|
||||
npx tsc --noEmit
|
||||
```
|
||||
|
||||
Expected: no errors (or only pre-existing errors unrelated to this change).
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/enrich-with-freesearch.ts
|
||||
git commit -m "fix: wait for FreeSearch on startup instead of exiting; clean stale jobs"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 2: Write heartbeat file in scheduler
|
||||
|
||||
**Files:**
|
||||
- Modify: `scripts/scheduler.ts`
|
||||
|
||||
The scheduler already has an hourly cron that logs a heartbeat message (lines 747-758). We add a single `fs.writeFileSync` call inside it to write the timestamp to `/app/logs/scheduler.heartbeat`. The `logs/` directory is already created by `ensureLogsDir()` at startup.
|
||||
|
||||
- [ ] **Step 1: Add heartbeat file write inside the hourly cron**
|
||||
|
||||
Find this block in `scripts/scheduler.ts` (around line 747):
|
||||
|
||||
```typescript
|
||||
// Heartbeat every hour — logs cycle state
|
||||
cron.schedule('0 * * * *', () => {
|
||||
const currentGroup = cycleState.currentGroupIndex < PIPELINE_GROUPS.length
|
||||
? PIPELINE_GROUPS[cycleState.currentGroupIndex].name
|
||||
: 'none';
|
||||
const jobs = runningJobs.size > 0
|
||||
? `Running: ${[...runningJobs.keys()].join(', ')}`
|
||||
: 'No jobs running';
|
||||
const state = cycleState.waitingForCooldown
|
||||
? 'cooldown'
|
||||
: `group ${cycleState.currentGroupIndex + 1}/${PIPELINE_GROUPS.length} (${currentGroup})`;
|
||||
log(`Heartbeat: Cycle ${cycleState.cycleNumber + 1}, ${state}. ${jobs}`);
|
||||
}, { timezone: 'UTC' });
|
||||
log('Registered cron job: heartbeat (hourly)');
|
||||
```
|
||||
|
||||
Replace with:
|
||||
|
||||
```typescript
|
||||
// Heartbeat every hour — logs cycle state and writes heartbeat file for Docker healthcheck
|
||||
cron.schedule('0 * * * *', () => {
|
||||
const currentGroup = cycleState.currentGroupIndex < PIPELINE_GROUPS.length
|
||||
? PIPELINE_GROUPS[cycleState.currentGroupIndex].name
|
||||
: 'none';
|
||||
const jobs = runningJobs.size > 0
|
||||
? `Running: ${[...runningJobs.keys()].join(', ')}`
|
||||
: 'No jobs running';
|
||||
const state = cycleState.waitingForCooldown
|
||||
? 'cooldown'
|
||||
: `group ${cycleState.currentGroupIndex + 1}/${PIPELINE_GROUPS.length} (${currentGroup})`;
|
||||
log(`Heartbeat: Cycle ${cycleState.cycleNumber + 1}, ${state}. ${jobs}`);
|
||||
fs.writeFileSync(path.join(LOGS_DIR, 'scheduler.heartbeat'), new Date().toISOString());
|
||||
}, { timezone: 'UTC' });
|
||||
log('Registered cron job: heartbeat (hourly)');
|
||||
```
|
||||
|
||||
`fs` and `path` are already imported in `scheduler.ts`. `LOGS_DIR` is already defined as `'/app/logs'`.
|
||||
|
||||
- [ ] **Step 2: Verify the script compiles**
|
||||
|
||||
```bash
|
||||
cd /home/albert/Documents/ScraperControl
|
||||
npx tsc --noEmit
|
||||
```
|
||||
|
||||
Expected: no errors.
|
||||
|
||||
- [ ] **Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add scripts/scheduler.ts
|
||||
git commit -m "fix: write heartbeat file for Docker healthcheck"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 3: Fix scheduler healthcheck in docker-compose.yml
|
||||
|
||||
**Files:**
|
||||
- Modify: `docker-compose.yml`
|
||||
|
||||
- [ ] **Step 1: Replace the scheduler healthcheck**
|
||||
|
||||
Find this block in `docker-compose.yml` (around line 275):
|
||||
|
||||
```yaml
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pgrep -f scheduler.ts || exit 1"]
|
||||
interval: 60s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
```
|
||||
|
||||
Replace with:
|
||||
|
||||
```yaml
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "find /app/logs/scheduler.heartbeat -mmin -120 2>/dev/null | grep -q . || exit 1"]
|
||||
interval: 90s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 90s
|
||||
```
|
||||
|
||||
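The `find ... -mmin -120` check passes if the file exists and was modified within the last 120 minutes (2 hours). Note that `start_period: 90s` does not cover the wait for the first hourly cron tick, which can be up to 60 minutes away: on a fresh deployment with no existing heartbeat file, the container reports `unhealthy` once the start period has passed and the retries are exhausted, and only turns `healthy` after the first heartbeat is written (see Task 4, Step 6).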
|
||||
|
||||
- [ ] **Step 2: Commit**
|
||||
|
||||
```bash
|
||||
git add docker-compose.yml
|
||||
git commit -m "fix: replace pgrep healthcheck with heartbeat file check"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 4: Deploy and verify
|
||||
|
||||
- [ ] **Step 1: Sync dev directory to Docker deployment**
|
||||
|
||||
```bash
|
||||
cd /home/albert/Documents/ScraperControl
|
||||
bash scripts/deploy-local.sh
|
||||
```
|
||||
|
||||
Expected: rsync output showing the three changed files transferred to `/opt/docker/scraper-control/`.
|
||||
|
||||
- [ ] **Step 2: Restart the two affected containers**
|
||||
|
||||
```bash
|
||||
docker compose -f /opt/docker/scraper-control/docker-compose.yml restart freesearch-enrichment scheduler
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Verify freesearch-enrichment is stable**
|
||||
|
||||
```bash
|
||||
docker logs scraper-control-freesearch-enrichment-1 --tail 30 -f
|
||||
```
|
||||
|
||||
Expected: logs showing "Waiting for FreeSearch to be reachable..." with retry messages if FreeSearch is still down, OR "FreeSearch health check: OK" and normal enrichment if FreeSearch is up. Container should NOT exit. Wait 2 minutes to confirm no restart.
|
||||
|
||||
- [ ] **Step 4: Confirm stale jobs were cleaned up**
|
||||
|
||||
```bash
|
||||
docker exec scraper-control-db-1 psql -U postgres -d nearestmass \
|
||||
-c "SELECT type, status, started_at, completed_at, error FROM background_jobs WHERE type = 'freesearch-enrichment' ORDER BY started_at DESC LIMIT 5;"
|
||||
```
|
||||
|
||||
Expected: the two previously-stuck `running` jobs from Mar 22 and Mar 26 now show `status = 'failed'` with `error = 'Container restarted'`.
|
||||
|
||||
- [ ] **Step 5: Verify scheduler heartbeat file is written**
|
||||
|
||||
Check if the file already exists from before (it won't — it's new). Wait for next hourly cron tick, or check after 60 minutes:
|
||||
|
||||
```bash
|
||||
docker exec scraper-control-scheduler-1 cat /app/logs/scheduler.heartbeat
|
||||
```
|
||||
|
||||
Expected: an ISO timestamp, e.g. `2026-03-28T14:00:00.000Z`
|
||||
|
||||
- [ ] **Step 6: Verify scheduler becomes healthy**
|
||||
|
||||
```bash
|
||||
docker ps --format "table {{.Names}}\t{{.Status}}" | grep scheduler
|
||||
```
|
||||
|
||||
Expected: `scraper-control-scheduler-1 Up X hours (healthy)` — but only after the first heartbeat fires AND Docker's `start_period` (90s) passes. If the next cron tick hasn't happened yet, `status` will remain `starting` or `unhealthy` until it does.
|
||||
|
||||
To force an immediate test without waiting for the cron:
|
||||
|
||||
```bash
|
||||
docker exec scraper-control-scheduler-1 bash -c \
|
||||
"date -u +%Y-%m-%dT%H:%M:%S.000Z > /app/logs/scheduler.heartbeat && echo 'written'"
|
||||
docker exec scraper-control-scheduler-1 \
|
||||
find /app/logs/scheduler.heartbeat -mmin -120 2>/dev/null | grep -q . && echo "PASS" || echo "FAIL"
|
||||
```
|
||||
|
||||
Expected: `written` then `PASS`.
|
||||
103
docs/superpowers/specs/2026-03-28-freesearch-stability-design.md
Normal file
103
docs/superpowers/specs/2026-03-28-freesearch-stability-design.md
Normal file
@@ -0,0 +1,103 @@
|
||||
# FreeSearch Stability & Scheduler Healthcheck Fix
|
||||
|
||||
**Date:** 2026-03-28
|
||||
**Status:** Approved
|
||||
**Scope:** `scripts/enrich-with-freesearch.ts`, `scripts/scheduler.ts`, `docker-compose.yml`
|
||||
|
||||
---
|
||||
|
||||
## Problem Summary
|
||||
|
||||
Three related infrastructure reliability issues identified during health check:
|
||||
|
||||
1. **FreeSearch crash loop** — `freesearch-enrichment` container restarts every ~60s because startup health check calls `process.exit(1)` when FreeSearch API is unreachable. The circuit breaker (which handles mid-run outages) lives inside `runContinuous()` and is never reached.
|
||||
|
||||
2. **Stale running jobs** — Each container restart creates a new `freesearch-enrichment` DB job without cleaning up the previous `running` one. Two jobs from Mar 22 and Mar 26 are permanently stuck as `running`.
|
||||
|
||||
3. **Scheduler healthcheck failing** — `node:20-bookworm-slim` does not include `procps`/`pgrep`. The healthcheck command `pgrep -f scheduler.ts` exits 1 silently → scheduler shows as `unhealthy` despite working correctly.
|
||||
|
||||
---
|
||||
|
||||
## Fix 1: FreeSearch Startup Resilience
|
||||
|
||||
### Change
|
||||
|
||||
Replace the `process.exit(1)` startup health check in `main()` with a `waitForFreeSearch()` function.
|
||||
|
||||
### Behavior
|
||||
|
||||
- Polls `GET /api/health` with exponential backoff: 30s → 60s → 120s → 240s → cap at 300s (5 min)
|
||||
- Waits indefinitely — container stays alive until FreeSearch comes back
|
||||
- Logs each attempt: `"FreeSearch not reachable, retrying in 120s..."`
|
||||
- Logs recovery: `"FreeSearch is back, continuing..."`
|
||||
- Proceeds to job setup and `runContinuous()` once health check passes
|
||||
|
||||
### Stale job cleanup (also in `main()`)
|
||||
|
||||
Before creating a new DB job in `main()`, run a cleanup:
|
||||
|
||||
```typescript
|
||||
await prisma.backgroundJob.updateMany({
|
||||
where: { type: 'freesearch-enrichment', status: 'running' },
|
||||
data: { status: 'failed', error: 'Container restarted', completedAt: new Date() },
|
||||
});
|
||||
```
|
||||
|
||||
This fixes the two existing stuck jobs and prevents the pattern from recurring on future restarts.
|
||||
|
||||
### Files changed
|
||||
|
||||
- `scripts/enrich-with-freesearch.ts`: ~25 lines
|
||||
|
||||
---
|
||||
|
||||
## Fix 2: Scheduler Healthcheck
|
||||
|
||||
### Change
|
||||
|
||||
Replace `pgrep`-based healthcheck with a heartbeat file approach.
|
||||
|
||||
**In `scheduler.ts`:** Add `writeHeartbeat()` call inside the existing hourly cron handler. Writes current ISO timestamp to `/app/logs/scheduler.heartbeat`.
|
||||
|
||||
**In `docker-compose.yml`:** Replace healthcheck:
|
||||
|
||||
```yaml
|
||||
# Before
|
||||
test: ["CMD-SHELL", "pgrep -f scheduler.ts || exit 1"]
|
||||
interval: 60s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
|
||||
# After
|
||||
test: ["CMD-SHELL", "find /app/logs/scheduler.heartbeat -mmin -120 2>/dev/null | grep -q . || exit 1"]
|
||||
interval: 90s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 90s
|
||||
```
|
||||
|
||||
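The `./logs` volume is already mounted, so the heartbeat file persists across container restarts. `start_period: 90s` only suppresses failures during the first 90 seconds; on a fresh deployment the healthcheck can still report `unhealthy` until the first hourly cron tick writes the heartbeat file (up to 60 minutes later).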
|
||||
|
||||
### Files changed
|
||||
|
||||
- `scripts/scheduler.ts`: ~5 lines
|
||||
- `docker-compose.yml`: 4 lines
|
||||
|
||||
---
|
||||
|
||||
## Fix 3: Deploy
|
||||
|
||||
```bash
|
||||
bash scripts/deploy-local.sh
|
||||
docker compose -f /opt/docker/scraper-control/docker-compose.yml restart freesearch-enrichment scheduler
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Success Criteria
|
||||
|
||||
- `freesearch-enrichment` container stays running even when FreeSearch is down, resumes enrichment when it comes back
|
||||
- No new stale `running` freesearch-enrichment jobs after container restarts
|
||||
- `scheduler` container shows as `healthy` in `docker ps`
|
||||
- No behavioral changes to enrichment logic itself
|
||||
@@ -22,6 +22,7 @@
|
||||
"scrape:diocese": "tsx scripts/scrape-diocese-directory.ts",
|
||||
"setup:diocese": "tsx scripts/setup-diocese.ts",
|
||||
"import:gcatholic": "tsx scripts/import-gcatholic.ts",
|
||||
"import:buscarmisas-network": "tsx scripts/import-buscarmisas-network.ts",
|
||||
"import:orarimesse": "tsx scripts/import-orarimesse.ts",
|
||||
"import:mass-schedules-ph": "tsx scripts/import-mass-schedules-ph.ts",
|
||||
"import:philmass": "tsx scripts/import-philmass.ts",
|
||||
@@ -29,7 +30,7 @@
|
||||
"import:msze-info": "tsx scripts/import-msze-info.ts",
|
||||
"import:weekdaymasses": "tsx scripts/import-weekdaymasses.ts",
|
||||
"import:masstimes-api": "tsx scripts/import-masstimes-api.ts",
|
||||
"import:discovermass": "tsx scripts/import-discovermass.ts",
|
||||
"dedup:geo": "tsx scripts/find-geo-duplicates.ts",
|
||||
"postinstall": "prisma generate"
|
||||
},
|
||||
"dependencies": {
|
||||
|
||||
25
scripts/deploy-local.sh
Executable file
25
scripts/deploy-local.sh
Executable file
@@ -0,0 +1,25 @@
|
||||
#!/bin/bash
# Sync the development checkout into the Docker deployment directory, then
# rebuild and restart the long-running services (app, scheduler,
# freesearch-enrichment).
#
# Strict mode: abort on any failing command, on use of an unset variable, and
# on a failure anywhere inside a pipeline.
set -euo pipefail

DEV_PATH="$HOME/Documents/ScraperControl"
DOCKER_PATH="/opt/docker/scraper-control"

# Fail with a clear message instead of a raw rsync error when the checkout
# is not where this script expects it.
if [[ ! -d "$DEV_PATH" ]]; then
  echo "Dev directory not found: $DEV_PATH" >&2
  exit 1
fi

echo "Syncing dev → Docker deployment..."

# Both paths are on the local machine, so no -z: compressing a local copy only
# costs CPU. Build artefacts, VCS data, tool state and every .env* file are
# excluded, so the deployment keeps its own environment files.
rsync -av \
  --exclude node_modules \
  --exclude .next \
  --exclude '.env*' \
  --exclude .git \
  --exclude .claude \
  --exclude .playwright-mcp \
  "$DEV_PATH/" "$DOCKER_PATH/"

echo "Restarting Docker services..."
cd "$DOCKER_PATH"
docker compose build app scheduler freesearch-enrichment
docker compose up -d app scheduler freesearch-enrichment
docker compose ps
docker compose logs --tail 5 scheduler

echo "Deploy complete!"
|
||||
1371
scripts/enrich-with-freesearch.ts
Normal file
1371
scripts/enrich-with-freesearch.ts
Normal file
File diff suppressed because it is too large
Load Diff
@@ -30,6 +30,12 @@ import { findDuplicateChurch } from '../src/lib/church-matcher';
|
||||
import type { ExistingChurch } from '../src/lib/church-matcher';
|
||||
import { getDayNamesForCountry, buildDayPatterns } from '../src/scrapers/i18n/day-names';
|
||||
|
||||
// Connection string: DATABASE_URL when set (and non-empty), otherwise a local default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the password masked. The pattern anchors on the `//`
// that starts the authority and replaces everything between the first `:`
// after the user name and the `@`, so a password that itself contains `:`
// is hidden in full rather than only its last segment.
console.log(`Connecting to: ${dbUrl.replace(/(\/\/[^:@/]*):[^@]*@/, '$1:***@')}`);
// TLS is switched on (without certificate verification) only when the URL
// contains "neon"; every other target uses the driver default.
const pool = new Pool({ connectionString: dbUrl, ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
|
||||
|
||||
// ─── Site Config ─────────────────────────────────────────────────────────────
|
||||
|
||||
interface SiteConfig {
|
||||
@@ -218,6 +224,137 @@ function sleep(ms: number): Promise<void> {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
// ─── DB Helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load every church row stored for the given country.
 *
 * Only the columns listed in `select` are fetched: the primary key, name and
 * coordinates, the per-source external id columns, and a few contact fields.
 * Progress is logged before and after the query.
 *
 * @param country Value matched against the `country` column.
 * @returns The selected rows, cast to `ExistingChurch[]`.
 */
async function loadExistingChurches(country: string): Promise<ExistingChurch[]> {
  console.log(`Loading existing ${country} churches from DB...`);

  const rows = await prisma.church.findMany({
    where: { country },
    select: {
      // Identity and position
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      // External ids, one column per upstream source
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      discovermassId: true,
      buscarmisasNetworkId: true,
      // Provenance and contact details
      source: true,
      website: true,
      phone: true,
      address: true,
      country: true,
    },
  });

  console.log(` Loaded ${rows.length} existing ${country} churches`);
  return rows as ExistingChurch[];
}
|
||||
|
||||
// ─── Church Processing ────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Fetch one church page, parse it, and create or update the church together
 * with its mass schedule.
 *
 * Flow:
 *  1. Download and parse the page; pages that yield no parsed church are
 *     counted as skipped.
 *  2. In dry-run mode, log what was found and stop before touching the DB.
 *  3. If `findDuplicateChurch` matches an existing church, update it (external
 *     id, missing phone, missing coordinates, `lastScrapedAt`) and replace its
 *     mass schedule when the page yielded any masses.
 *  4. Otherwise create the church and its mass schedule.
 *
 * Each database path runs in a single transaction, so a failure while writing
 * schedules never leaves a half-written church behind.
 *
 * Side effects: mutates `stats` counters and keeps `existingChurches` in sync
 * with what was written (so later pages can match against it). Errors are
 * caught, counted in `stats.errors` and logged; this function does not throw.
 *
 * @param url Page to fetch.
 * @param domain Passed through to `parseChurchPage`.
 * @param config Site configuration; `country` drives schedule parsing and
 *   `language` selects the schedule language label.
 * @param existingChurches In-memory list used for duplicate detection.
 * @param args CLI arguments; only `dryRun` is read here.
 * @param stats Running counters, updated in place.
 */
async function processChurch(
  url: string,
  domain: string,
  config: SiteConfig,
  existingChurches: ExistingChurch[],
  args: CLIArgs,
  stats: ImportStats,
): Promise<void> {
  stats.total++;
  try {
    const html = await fetchWithRetry(url);
    const parsed = parseChurchPage(html, domain, url, config);
    if (!parsed) {
      console.log(` [skip] No name/coords: ${url}`);
      stats.skipped++;
      return;
    }

    const masses = parseMassSchedule(html, config.country);

    if (args.dryRun) {
      console.log(` [dry-run] ${parsed.name} — ${masses.length} masses`);
      return;
    }

    // Every schedule row from one site shares the same language label.
    const massLanguage = config.language === 'pt' ? 'Portuguese' : 'Spanish';
    const toScheduleRows = <TId>(churchId: TId) =>
      masses.map(m => ({
        churchId,
        dayOfWeek: m.dayOfWeek,
        time: m.time,
        language: massLanguage,
        notes: null,
      }));

    const candidate = {
      name: parsed.name,
      lat: parsed.lat,
      lng: parsed.lng,
      buscarmisasNetworkId: parsed.externalId,
    };
    const duplicate = findDuplicateChurch(candidate, existingChurches);

    if (duplicate) {
      // One write carries the external id, any backfilled fields and the
      // scrape timestamp, instead of two updates to the same row.
      const updateData: Record<string, unknown> = {
        buscarmisasNetworkId: parsed.externalId,
        lastScrapedAt: new Date(),
      };
      if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
      // Only backfill coordinates when the stored latitude is the 0 placeholder.
      if (parsed.lat !== 0 && duplicate.latitude === 0) {
        updateData.latitude = parsed.lat;
        updateData.longitude = parsed.lng;
      }

      await prisma.$transaction(async (tx) => {
        await tx.church.update({ where: { id: duplicate.id }, data: updateData });
        // Replace the schedule only when this page produced one; an empty
        // parse must not wipe schedules that are already stored.
        if (masses.length > 0) {
          await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
          await tx.massSchedule.createMany({ data: toScheduleRows(duplicate.id) });
        }
      });
      duplicate.buscarmisasNetworkId = parsed.externalId;
      stats.updated++;
    } else {
      // Church and schedules are written atomically: if the schedule insert
      // fails, the church row is rolled back too.
      const church = await prisma.$transaction(async (tx) => {
        const created = await tx.church.create({
          data: {
            name: parsed.name,
            address: parsed.address,
            city: parsed.city,
            state: parsed.state,
            country: parsed.country,
            phone: parsed.phone,
            latitude: parsed.lat,
            longitude: parsed.lng,
            buscarmisasNetworkId: parsed.externalId,
            source: 'buscarmisas-network',
            hasWebsite: false,
            // Stamp the scrape time only when a schedule was actually found.
            ...(masses.length > 0 ? { lastScrapedAt: new Date() } : {}),
          },
        });
        if (masses.length > 0) {
          await tx.massSchedule.createMany({ data: toScheduleRows(created.id) });
        }
        return created;
      });

      // Registered only after the transaction commits, so the in-memory list
      // never contains a church that was rolled back.
      existingChurches.push({
        id: church.id, name: parsed.name, latitude: parsed.lat, longitude: parsed.lng,
        osmId: null, baiduId: null, masstimesId: null, orarimesseId: null,
        massSchedulesPhId: null, philmassId: null, horariosMisasId: null,
        mszeInfoId: null, weekdayMassesId: null, messesInfoId: null,
        bohosluzbyId: null, miserendId: null, kerknetId: null,
        gottesdienstzeitenId: null, discovermassId: null,
        buscarmisasNetworkId: parsed.externalId,
        source: 'buscarmisas-network', website: null, phone: parsed.phone,
        address: parsed.address, country: parsed.country,
      });
      stats.created++;
    }

    stats.massSchedulesCreated += masses.length;
    console.log(
      ` [${duplicate ? 'update' : 'create'}] ${parsed.name} — ${masses.length} masses — ` +
      `${stats.total} total (${stats.created}↑ ${stats.updated}↻ ${stats.errors}✗)`
    );
  } catch (err) {
    stats.errors++;
    console.error(` [error] ${url}: ${err instanceof Error ? err.message : err}`);
  }
}
|
||||
|
||||
// ─── Sitemap Discovery ────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
@@ -257,3 +394,141 @@ export async function getChurchUrls(domain: string, config: SiteConfig): Promise
|
||||
console.log(` Total church URLs: ${unique.length}`);
|
||||
return unique;
|
||||
}
|
||||
|
||||
// ─── CLI ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Parse `process.argv` into a `CLIArgs` object.
 *
 * Recognized flags: `--domain <name>`, `--all`, `--dry-run`,
 * `--resume-from <n>`, `--limit <n>`, `--job-id <id>`. Unknown arguments are
 * ignored. A `--domain` or `--job-id` flag with no following value yields `null`.
 *
 * @returns The parsed options, with defaults for every flag that was not given.
 * @throws Error when `--resume-from` or `--limit` has no value or the value is
 *   not a non-negative integer. Letting `NaN` or a negative number through
 *   would silently change which URLs get sliced out for processing.
 */
function parseCLIArgs(): CLIArgs {
  const argv = process.argv.slice(2);
  const result: CLIArgs = { domain: null, all: false, dryRun: false, resumeFrom: 0, limit: null, jobId: null };

  // Strict parse: digits only, so "abc", "-5", "10abc" and a missing value all fail.
  const readCount = (flag: string, raw: string | undefined): number => {
    if (raw === undefined || !/^\d+$/.test(raw)) {
      throw new Error(`${flag} expects a non-negative integer, got: ${raw ?? '(nothing)'}`);
    }
    return Number(raw);
  };

  for (let i = 0; i < argv.length; i++) {
    switch (argv[i]) {
      case '--domain': result.domain = argv[++i] ?? null; break;
      case '--all': result.all = true; break;
      case '--dry-run': result.dryRun = true; break;
      case '--resume-from': result.resumeFrom = readCount('--resume-from', argv[++i]); break;
      case '--limit': result.limit = readCount('--limit', argv[++i]); break;
      case '--job-id': result.jobId = argv[++i] ?? null; break;
    }
  }
  return result;
}
|
||||
|
||||
/**
 * Check the parsed CLI options and terminate the process with exit code 1
 * (after printing an explanation to stderr) when they cannot be run.
 *
 * Rejected combinations:
 * - neither `--domain` nor `--all` given;
 * - `--domain` naming a site that is not a key of `NETWORK_SITES`;
 * - `--resume-from` combined with `--all`.
 *
 * @param args - Options produced by the CLI parser.
 */
function validateArgs(args: CLIArgs): void {
  const validDomains = Object.keys(NETWORK_SITES).join(', ');

  if (!args.domain && !args.all) {
    console.error('Usage:');
    console.error(' npx tsx scripts/import-buscarmisas-network.ts --domain <domain>');
    console.error(' npx tsx scripts/import-buscarmisas-network.ts --all');
    console.error('\nValid domains:', validDomains);
    process.exit(1);
  }

  // Own-property check: a plain index lookup would also accept inherited keys
  // such as "constructor" or "toString" as if they were configured sites.
  if (args.domain && !Object.prototype.hasOwnProperty.call(NETWORK_SITES, args.domain)) {
    console.error(`Unknown domain: ${args.domain}`);
    console.error('Valid domains:', validDomains);
    process.exit(1);
  }

  if (args.all && args.resumeFrom > 0) {
    console.error('--resume-from cannot be used with --all. Use --domain to resume a specific site.');
    process.exit(1);
  }
}
|
||||
|
||||
/**
 * Import every not-yet-imported church of one network site.
 *
 * Discovers the site's church URLs, drops the first `args.resumeFrom` of them,
 * filters out URLs whose external ID already belongs to a loaded church, applies
 * `args.limit` if set, and then processes the remainder one at a time with a
 * pause of `REQUEST_DELAY_MS` between requests.
 *
 * @param domain - Site to import.
 * @param config - Configuration of that site; `country` selects which existing
 *   churches are loaded for comparison.
 * @param args - CLI options; `resumeFrom` and `limit` are read here and the
 *   whole object is forwarded to `processChurch`.
 * @returns Counters for this domain only.
 */
async function runDomain(domain: string, config: SiteConfig, args: CLIArgs): Promise<ImportStats> {
  const stats: ImportStats = {
    total: 0,
    created: 0,
    updated: 0,
    skipped: 0,
    errors: 0,
    massSchedulesCreated: 0,
  };

  const allUrls = await getChurchUrls(domain, config);
  const existingChurches = await loadExistingChurches(config.country);

  // External IDs that are already stored, for a fast membership test below.
  const importedIds = new Set(
    existingChurches.flatMap(church => (church.buscarmisasNetworkId ? [church.buscarmisasNetworkId] : []))
  );

  const notYetImported = allUrls
    .slice(args.resumeFrom)
    .filter(pageUrl => !importedIds.has(buildExternalId(domain, pageUrl)));
  const candidateUrls = args.limit !== null ? notYetImported.slice(0, args.limit) : notYetImported;

  console.log(`\n${domain}: ${allUrls.length} total | ${importedIds.size} already imported | ${candidateUrls.length} to process\n`);

  const lastIndex = candidateUrls.length - 1;
  for (const [index, pageUrl] of candidateUrls.entries()) {
    console.log(`[${index + 1}/${candidateUrls.length}] ${pageUrl}`);
    await processChurch(pageUrl, domain, config, existingChurches, args, stats);
    // Pause between requests, but not after the final one.
    if (index < lastIndex) await sleep(REQUEST_DELAY_MS);
  }

  return stats;
}
|
||||
|
||||
// ─── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Script entry point: parse and validate the CLI options, run the import for
 * the selected domain(s), print a summary, and — when `--job-id` was given —
 * record progress on the corresponding background job row.
 *
 * The job is recorded as `failed` when the run aborted with an exception or
 * when more than 10% of processed URLs errored; otherwise `completed`.
 * Database connections are closed in every case. An exception from the import
 * loop is re-thrown after cleanup so the caller can set the exit code.
 */
async function main(): Promise<void> {
  const args = parseCLIArgs();
  validateArgs(args);

  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (err) {
      // Best effort: the job row may not exist yet. Report it, but keep going.
      console.warn(`Could not mark job ${args.jobId} as running: ${err instanceof Error ? err.message : err}`);
    }
  }

  const domainsToRun: [string, SiteConfig][] = args.all
    ? Object.entries(NETWORK_SITES)
    : [[args.domain!, NETWORK_SITES[args.domain!]]];

  const totalStats: ImportStats = { total: 0, created: 0, updated: 0, skipped: 0, errors: 0, massSchedulesCreated: 0 };

  // Set when the loop below is cut short by an exception, so the job is not
  // recorded as completed on the strength of a low per-URL error ratio.
  let aborted = false;

  try {
    for (let d = 0; d < domainsToRun.length; d++) {
      const [domain, config] = domainsToRun[d];
      console.log(`\n${'─'.repeat(60)}`);
      console.log(`Domain ${d + 1}/${domainsToRun.length}: ${domain} (${config.country})`);
      console.log('─'.repeat(60));
      const stats = await runDomain(domain, config, args);
      totalStats.total += stats.total;
      totalStats.created += stats.created;
      totalStats.updated += stats.updated;
      totalStats.skipped += stats.skipped;
      totalStats.errors += stats.errors;
      totalStats.massSchedulesCreated += stats.massSchedulesCreated;
      if (d < domainsToRun.length - 1) await sleep(DOMAIN_DELAY_MS);
    }
  } catch (err) {
    aborted = true;
    throw err;
  } finally {
    console.log('\n─── Import Complete ───────────────────────────────────────');
    if (aborted) console.error('Run aborted by a fatal error; the totals below are partial.');
    console.log(`Total processed: ${totalStats.total}`);
    console.log(`Created: ${totalStats.created}`);
    console.log(`Updated: ${totalStats.updated}`);
    console.log(`Skipped: ${totalStats.skipped}`);
    console.log(`Errors: ${totalStats.errors}`);
    console.log(`Mass schedules: ${totalStats.massSchedulesCreated}`);

    if (args.jobId) {
      const tooManyErrors = totalStats.errors > totalStats.total * 0.1;
      const status = aborted || tooManyErrors ? 'failed' : 'completed';
      try {
        await prisma.backgroundJob.update({
          where: { id: args.jobId },
          data: {
            status,
            completedAt: new Date(),
            processed: totalStats.total,
            succeeded: totalStats.created + totalStats.updated,
            failed: totalStats.errors,
            itemsFound: totalStats.massSchedulesCreated,
          },
        });
      } catch (err) {
        // Best effort: a failed status write must not mask the import result.
        console.warn(`Could not record final status for job ${args.jobId}: ${err instanceof Error ? err.message : err}`);
      }
    }

    // Close the pool even if disconnecting the Prisma client fails.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  }
}
|
||||
|
||||
// Run the import; any rejection that escapes main() is reported and turned
// into a non-zero exit code.
main().catch((fatal) => {
  console.error('Fatal error:', fatal);
  process.exit(1);
});
|
||||
|
||||
@@ -57,7 +57,6 @@ const PIPELINE_GROUPS: PipelineGroup[] = [
|
||||
{ name: 'kerknet-import', type: 'kerknet-import', config: {} },
|
||||
{ name: 'gottesdienstzeiten-import', type: 'gottesdienstzeiten-import', config: {} },
|
||||
{ name: 'masstimes-api-import', type: 'masstimes-api-import', config: {} },
|
||||
{ name: 'discovermass-import', type: 'discovermass-import', config: {} },
|
||||
],
|
||||
},
|
||||
{
|
||||
@@ -237,11 +236,6 @@ function getJobCommand(type: string, language?: string | null, config?: Record<s
|
||||
if (config?.region) args.splice(2, 1, '--region', String(config.region)); // replace --all with --region
|
||||
return { command: 'npx', args };
|
||||
}
|
||||
case 'discovermass-import': {
|
||||
const args = ['tsx', 'scripts/import-discovermass.ts', '--all'];
|
||||
if (config?.resumeFrom) args.push('--resume-from', String(config.resumeFrom));
|
||||
return { command: 'npx', args };
|
||||
}
|
||||
default:
|
||||
throw new Error(`Unknown job type: ${type}`);
|
||||
}
|
||||
@@ -737,6 +731,7 @@ async function main(): Promise<void> {
|
||||
|
||||
validateEnvironment();
|
||||
ensureLogsDir();
|
||||
fs.writeFileSync(path.join(LOGS_DIR, 'scheduler.heartbeat'), new Date().toISOString());
|
||||
|
||||
// Crash recovery: mark orphaned jobs as failed
|
||||
await recoverFromCrash();
|
||||
@@ -749,7 +744,7 @@ async function main(): Promise<void> {
|
||||
cron.schedule('0 */6 * * *', () => cleanStaleJobs(), { timezone: 'UTC' });
|
||||
log('Registered cron job: stale-job-cleanup (every 6h)');
|
||||
|
||||
// Heartbeat every hour — logs cycle state
|
||||
// Heartbeat every hour — logs cycle state and writes heartbeat file for Docker healthcheck
|
||||
cron.schedule('0 * * * *', () => {
|
||||
const currentGroup = cycleState.currentGroupIndex < PIPELINE_GROUPS.length
|
||||
? PIPELINE_GROUPS[cycleState.currentGroupIndex].name
|
||||
@@ -761,6 +756,7 @@ async function main(): Promise<void> {
|
||||
? 'cooldown'
|
||||
: `group ${cycleState.currentGroupIndex + 1}/${PIPELINE_GROUPS.length} (${currentGroup})`;
|
||||
log(`Heartbeat: Cycle ${cycleState.cycleNumber + 1}, ${state}. ${jobs}`);
|
||||
fs.writeFileSync(path.join(LOGS_DIR, 'scheduler.heartbeat'), new Date().toISOString());
|
||||
}, { timezone: 'UTC' });
|
||||
log('Registered cron job: heartbeat (hourly)');
|
||||
|
||||
|
||||
Reference in New Issue
Block a user