Files
ScraperControl/scripts/enrich-with-google-places.ts
Albert 2c51513851 chore: sync with Gitea master and restore local-only files
Reset local main to gitea/master (new source of truth) and restored
local-only files: web scrapers, admin dashboard, ChromaDB integration,
debug scripts, and utility libraries that aren't tracked in Gitea.

Gitea master adds: discovermass, buscarmisas-network, hk-parishes,
bohosluzby, kerknet, gottesdienstzeiten, miserend importers,
ClaimRequest model, forward geocoding, heartbeat healthcheck.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 19:11:22 -04:00

409 lines
12 KiB
TypeScript

#!/usr/bin/env tsx
/**
* Enrich OSM churches with Google Places data (website, phone, Google place ID)
*
* Usage:
* npx tsx scripts/enrich-with-google-places.ts --limit 10 --dry-run
* npx tsx scripts/enrich-with-google-places.ts --country BR --limit 100
* npx tsx scripts/enrich-with-google-places.ts --all
*
* Rate Limiting:
* - Free tier: $200/month credit
* - Text Search: ~$17 per 1000 requests
* - $200 / $17 = ~11,764 requests per month
* - ~390 churches per day to stay within free tier
* - Script uses 2-second delay between requests (max 1,800/hour)
*/
// Load .env for database connection
import dotenv from 'dotenv';
import path from 'path';
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
// Use DATABASE_URL from .env (works for both local dev and NAS/production)
import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import axios from 'axios';
// Database access: a pg connection pool built from DATABASE_URL, wrapped in the
// Prisma pg driver adapter, which the Prisma client then uses for all queries.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
// API key read from the environment; undefined when unset (main() exits and
// searchGooglePlaces() throws in that case).
const GOOGLE_PLACES_API_KEY = process.env.GOOGLE_PLACES_API_KEY;
// Endpoint that searchGooglePlaces() POSTs its text query to.
const PLACES_API_URL = 'https://places.googleapis.com/v1/places:searchText';
// Pause (in milliseconds) that enrichChurches() awaits between lookups.
const RATE_LIMIT_MS = 2000; // 2 seconds between requests
// --- Job Tracking ---
/**
 * Resume an existing background job when `--job-id <id>` is present in the
 * CLI arguments.
 *
 * Despite the name, this function never creates a job: without the flag it
 * returns `null` and the caller decides whether a new job is needed.
 * Resuming sets the job's status to `running` and overwrites `startedAt`
 * with the current time.
 *
 * @param args CLI arguments (for example `process.argv.slice(2)`).
 * @returns The resumed job id, or `null` when `--job-id` was not supplied.
 * @throws Error when `--job-id` is present but not followed by a value.
 *   The Prisma update itself may also reject (presumably when no job with
 *   that id exists — confirm against the Prisma client behaviour).
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) {
    return null;
  }

  // The id is the token right after the flag. Reject a missing value (flag is
  // the last argument) or another flag in its place instead of handing an
  // undefined/garbage id to the database layer.
  const jobId = args[flagIndex + 1];
  if (jobId === undefined || jobId === '' || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job ID value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
/**
 * Insert a new `google-enrichment` background job that is already marked as
 * running, with `startedAt` set to the current time.
 *
 * @param config Run options stored on the job's `config` field.
 * @returns The id of the created job row.
 */
async function createNewJob(config: Record<string, unknown>): Promise<string> {
  const startedAt = new Date();
  const { id } = await prisma.backgroundJob.create({
    data: {
      type: 'google-enrichment',
      status: 'running',
      startedAt,
      // Cast kept from the original: the config column's input type is not
      // visible from this file.
      config: config as any,
    },
  });
  return id;
}
/**
 * Write the current progress counters onto a background job row.
 *
 * @param jobId Id of the job to update.
 * @param processed Value stored in the job's `processed` field.
 * @param succeeded Value stored in the job's `succeeded` field.
 * @param failed Value stored in the job's `failed` field.
 * @param itemsFound Value stored in the job's `itemsFound` field.
 * @param totalItems Value stored in the job's `totalItems` field.
 */
async function updateJobProgress(
  jobId: string,
  processed: number,
  succeeded: number,
  failed: number,
  itemsFound: number,
  totalItems: number
): Promise<void> {
  const counters = { processed, succeeded, failed, itemsFound, totalItems };
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: counters,
  });
}
/**
 * Report whether the given job's status is currently `stopping`.
 *
 * @param jobId Id of the job to look up.
 * @returns `true` only when the job exists and its status is `stopping`;
 *   `false` when the job is missing or has any other status.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const record = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!record) {
    return false;
  }
  return record.status === 'stopping';
}
/**
 * Close out a background job, stamping `completedAt` with the current time.
 *
 * @param jobId Id of the job to finish.
 * @param error Optional failure description. A truthy value marks the job
 *   `failed`; otherwise it is marked `completed`. The value is stored in the
 *   job's `error` field either way.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const finalStatus = error ? 'failed' : 'completed';
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: {
      status: finalStatus,
      error,
      completedAt: new Date(),
    },
  });
}
/**
 * Country priority order — largest OSM church counts first, since those
 * have the most un-enriched churches. Covers all countries from the
 * CATHOLIC_COUNTRIES lists in import-osm-churches.ts.
 *
 * Every code must appear exactly once. Priority mode in enrichChurches()
 * runs one query per entry before any church has been marked as searched,
 * so a repeated code queues the same churches a second time and spends a
 * second paid Google request on each of them.
 */
const COUNTRY_PRIORITY = [
  // Top tier: 5000+ OSM churches
  'FR', 'IT', 'ES', 'DE', 'PL', 'BR',
  // High tier: 1000-5000
  'PT', 'AT', 'BE', 'CZ', 'PH', 'HU', 'US', 'MX', 'HR', 'GB',
  'CR', 'SK', 'EC', 'CH', 'AR', 'CA', 'CO', 'NL', 'IE', 'IN',
  'SI', 'AU',
  // Medium tier: 100-1000
  'PE', 'RO', 'KR', 'CL', 'ID', 'LT', 'BO', 'VN', 'BA', 'BY',
  'UA', 'VE', 'HN', 'UG', 'CD', 'GT', 'CU', 'SV', 'NI', 'PA',
  'DO', 'CN', 'JP', 'LV', 'RS', 'TZ', 'KE', 'AL', 'RU',
  // Lower tier: remaining countries
  'LU', 'MT', 'NZ', 'PG', 'FJ', 'NC', 'PF', 'UY', 'PY', 'HT',
  'CM', 'RW', 'BI', 'MG', 'MW', 'ZM', 'ZW', 'MZ', 'AO', 'NG',
  'BJ', 'TG', 'CI', 'BF', 'ML', 'NE', 'SN', 'GN', 'LR', 'SL',
  'GH', 'GA', 'CG', 'CF', 'TD', 'SD', 'ET', 'ER', 'SO',
  'TL', 'MY', 'SG', 'TH', 'LA', 'KH', 'MM', 'LK', 'BD', 'PK',
  'LB', 'IL', 'PS', 'JO', 'SY', 'IQ',
  'GF', 'SR', 'GY', 'BS', 'BB', 'JM', 'TT', 'GD', 'LC', 'VC',
  'AG', 'DM', 'KN', 'MC', 'SM', 'VA', 'LI', 'AD',
  // 'RS' is already listed in the medium tier above and must not be repeated here.
  'MK', 'EE', 'GE', 'AM',
  'NA', 'BW', 'LS', 'SZ', 'MU', 'SC', 'KM', 'CV', 'ST', 'GQ',
  'DJ', 'GM', 'BT', 'NP', 'AF', 'KZ', 'UZ', 'TM', 'TJ', 'KG',
  'MN', 'BN', 'MV', 'WS', 'TO', 'VU', 'SB', 'KI', 'NR', 'TV',
  'FM', 'MH', 'PW',
];
/** Outcome of a single Google Places lookup performed by searchGooglePlaces(). */
interface GooglePlacesResult {
  /** True when the search returned at least one place. */
  found: boolean;

  /** `websiteUri` of the first returned place; omitted when it has none. */
  website?: string;

  /** `nationalPhoneNumber` of the first returned place; omitted when it has none. */
  phone?: string;

  /** `id` of the first returned place; omitted when it has none. */
  placeId?: string;
}
/** Counters accumulated by enrichChurches() over one run. */
interface EnrichmentStats {
  /** Churches attempted, including those whose lookup failed. */
  processed: number;

  /** Churches updated in the database with a website and/or phone (never incremented on dry runs). */
  enriched: number;

  /** Lookups for which Google returned no place. */
  notFound: number;

  /** Lookups or database updates that threw, including a rate-limit stop. */
  errors: number;

  /** Results that carried a website (counted on dry runs too). */
  websitesAdded: number;

  /** Results that carried a phone number (counted on dry runs too). */
  phonesAdded: number;
}
/**
 * Look up one church through the Google Places text-search endpoint.
 *
 * The query text is the church name followed by "city, state" (whichever of
 * the two are present). The request carries a 500 m `locationBias` circle
 * around the church's coordinates and asks only for the place id, display
 * name, website and national phone number. Only the first returned place is
 * used.
 *
 * @param name Church name.
 * @param city City to append to the query, or null.
 * @param state State/region to append to the query, or null.
 * @param latitude Latitude of the bias circle's centre.
 * @param longitude Longitude of the bias circle's centre.
 * @returns `{ found: false }` when no place is returned; otherwise
 *   `found: true` plus whichever of website/phone/placeId the place has.
 * @throws Error('GOOGLE_PLACES_API_KEY not set in environment') when the key is missing.
 * @throws Error('RATE_LIMITED') when the API answers HTTP 429.
 * @throws Any other request error (including a timeout) unchanged.
 */
async function searchGooglePlaces(
  name: string,
  city: string | null,
  state: string | null,
  latitude: number,
  longitude: number
): Promise<GooglePlacesResult> {
  if (!GOOGLE_PLACES_API_KEY) {
    throw new Error('GOOGLE_PLACES_API_KEY not set in environment');
  }

  // axios applies no timeout by default, so a stalled connection would hang
  // the whole enrichment run. Give up on a single lookup after 15 seconds;
  // the caller counts it as an error and moves on to the next church.
  const REQUEST_TIMEOUT_MS = 15000;

  // Build search query
  const location = [city, state].filter(Boolean).join(', ');
  const textQuery = `${name} ${location}`.trim();

  try {
    const response = await axios.post(
      PLACES_API_URL,
      {
        textQuery,
        locationBias: {
          circle: {
            center: { latitude, longitude },
            radius: 500, // 500 meters
          },
        },
      },
      {
        timeout: REQUEST_TIMEOUT_MS,
        headers: {
          'Content-Type': 'application/json',
          'X-Goog-Api-Key': GOOGLE_PLACES_API_KEY,
          'X-Goog-FieldMask':
            'places.id,places.displayName,places.websiteUri,places.nationalPhoneNumber',
        },
      }
    );

    // A response without a non-empty `places` array means "no match".
    const places = response.data?.places;
    if (!Array.isArray(places) || places.length === 0) {
      return { found: false };
    }

    const place = places[0]; // Take first result
    return {
      found: true,
      website: place.websiteUri || undefined,
      phone: place.nationalPhoneNumber || undefined,
      placeId: place.id || undefined,
    };
  } catch (error: unknown) {
    // Read the HTTP status defensively: the thrown value is not guaranteed to
    // be an axios error carrying a response.
    const status = (error as { response?: { status?: number } } | null | undefined)?.response
      ?.status;
    if (status === 429) {
      console.error('Rate limited by Google Places API');
      throw new Error('RATE_LIMITED');
    }
    throw error;
  }
}
/**
 * Select OSM-sourced churches that have never been searched on Google
 * (`googleSearchedAt` is null), look each one up, and store what was found.
 *
 * Selection:
 * - With `countryCode`: only that country, oldest `createdAt` first, capped
 *   by `limit` (an undefined limit applies no cap).
 * - Without it: walks COUNTRY_PRIORITY in order, draining each country
 *   before the next, until `limit || 390` churches are queued.
 *   NOTE(review): this means an undefined limit (what main() passes for
 *   `--all`) is still capped at 390 in priority mode — confirm that is intended.
 *
 * Per church: a found result updates website/phone/googlePlaceId (falling
 * back to the existing values), `hasWebsite` and `googleSearchedAt`; a
 * not-found result only stamps `googleSearchedAt`. Nothing is written when
 * `dryRun` is true. A church whose lookup throws is left unstamped, so a
 * later run selects it again.
 *
 * The loop stops early on a RATE_LIMITED error, or when the tracked job's
 * status is `stopping` (checked every 10 churches).
 *
 * @param countryCode Optional country filter (manual mode).
 * @param limit Maximum number of churches to process.
 * @param dryRun When true, query Google but skip all church updates.
 * @param jobId Background job to report progress to, if any.
 * @returns Counters for the run.
 */
async function enrichChurches(
  countryCode?: string,
  limit?: number,
  dryRun: boolean = false,
  jobId?: string | null
): Promise<EnrichmentStats> {
  const stats: EnrichmentStats = {
    processed: 0,
    enriched: 0,
    notFound: 0,
    errors: 0,
    websitesAdded: 0,
    phonesAdded: 0,
  };

  let churches;
  if (countryCode) {
    // Manual override: process specific country
    console.log(`Manual mode: Processing country ${countryCode}`);
    churches = await prisma.church.findMany({
      where: {
        source: 'osm',
        googleSearchedAt: null,
        country: countryCode,
      },
      take: limit,
      orderBy: { createdAt: 'asc' },
    });
  } else {
    // Priority mode: sequential through countries (exhaust each before moving on)
    console.log('Priority mode: Processing countries sequentially');
    console.log(`Top priority countries: ${COUNTRY_PRIORITY.slice(0, 10).join(', ')}...\n`);
    churches = [];
    const targetTotal = limit || 390;
    for (const country of COUNTRY_PRIORITY) {
      if (churches.length >= targetTotal) break;
      const remaining = targetTotal - churches.length;
      const batch = await prisma.church.findMany({
        where: {
          source: 'osm',
          googleSearchedAt: null,
          country,
        },
        take: remaining,
        orderBy: { createdAt: 'asc' },
      });
      if (batch.length > 0) {
        churches.push(...batch);
        console.log(` Queued ${batch.length} churches from ${country}`);
      }
    }
  }

  console.log(`\nFound ${churches.length} churches to enrich`);
  console.log('');

  for (const church of churches) {
    stats.processed++;
    try {
      console.log(`[${stats.processed}/${churches.length}] ${church.name} (${church.city}, ${church.state})`);
      const result = await searchGooglePlaces(
        church.name,
        church.city,
        church.state,
        church.latitude,
        church.longitude
      );

      if (result.found) {
        console.log(' ✓ Found on Google Places');
        if (result.website) {
          console.log(` Website: ${result.website}`);
          stats.websitesAdded++;
        }
        if (result.phone) {
          console.log(` Phone: ${result.phone}`);
          stats.phonesAdded++;
        }
        if (!dryRun) {
          await prisma.church.update({
            where: { id: church.id },
            data: {
              // Google's value wins; keep the stored one when Google has none.
              website: result.website || church.website,
              phone: result.phone || church.phone,
              googlePlaceId: result.placeId || church.googlePlaceId,
              hasWebsite: !!(result.website || church.website),
              googleSearchedAt: new Date(),
            },
          });
          if (result.website || result.phone) {
            stats.enriched++;
          }
        }
      } else {
        console.log(' ✗ Not found on Google Places');
        stats.notFound++;
        // Mark as attempted so we don't re-query this church
        if (!dryRun) {
          await prisma.church.update({
            where: { id: church.id },
            data: { googleSearchedAt: new Date() },
          });
        }
      }
    } catch (error: unknown) {
      stats.errors++;
      const message = error instanceof Error ? error.message : String(error);
      if (message === 'RATE_LIMITED') {
        console.error(' ⚠ Rate limited, stopping enrichment');
        break;
      }
      console.error(` ✗ Error: ${message}`);
    }

    // Rate limiting. This runs after every attempt, failed ones included:
    // a failed lookup has usually still hit the API, so skipping the pause
    // on errors would let a run of failures fire requests back-to-back.
    await new Promise((resolve) => setTimeout(resolve, RATE_LIMIT_MS));

    // Job tracking: update progress every 10 items and check for stop
    if (jobId && stats.processed % 10 === 0) {
      await updateJobProgress(jobId, stats.processed, stats.enriched, stats.errors, stats.enriched, churches.length);
      const stopping = await checkJobStopping(jobId);
      if (stopping) {
        console.log('\nJob stop requested via admin dashboard.');
        break;
      }
    }

    // Progress update every 50 churches
    if (stats.processed % 50 === 0) {
      console.log('');
      console.log(`Progress: ${stats.processed}/${churches.length} processed`);
      console.log(` Enriched: ${stats.enriched}, Not found: ${stats.notFound}, Errors: ${stats.errors}`);
      console.log('');
    }
  }

  // Final job update
  if (jobId) {
    await updateJobProgress(jobId, stats.processed, stats.enriched, stats.errors, stats.enriched, churches.length);
  }
  return stats;
}
/**
 * CLI entry point: parse flags, run the enrichment, print a summary and
 * record the outcome on the tracking job.
 *
 * Flags read here: `--country <code>`, `--limit <n>` (default 10),
 * `--all` (no limit; overrides `--limit`), `--dry-run`. `--job-id` is
 * handled by createOrResumeJob(). When no job is resumed and this is not a
 * dry run, a new job is created.
 *
 * Exits with code 1 when GOOGLE_PLACES_API_KEY is missing or `--limit` is
 * not a positive integer. If the run throws, the tracking job (if any) is
 * marked failed with the error message, the database connections are
 * closed, and the error is rethrown to the caller.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const countryIndex = args.indexOf('--country');
  const limitIndex = args.indexOf('--limit');
  const dryRun = args.includes('--dry-run');
  const all = args.includes('--all');
  const countryCode = countryIndex !== -1 ? args[countryIndex + 1] : undefined;

  let limit: number | undefined;
  if (all) {
    limit = undefined;
  } else if (limitIndex !== -1) {
    // Explicit radix, and reject a missing/garbage/non-positive value before
    // it reaches the database query as NaN, 0 or a negative `take`.
    limit = parseInt(args[limitIndex + 1] ?? '', 10);
    if (!Number.isInteger(limit) || limit < 1) {
      console.error('Error: --limit requires a positive integer');
      process.exit(1);
    }
  } else {
    limit = 10;
  }

  if (!GOOGLE_PLACES_API_KEY) {
    console.error('Error: GOOGLE_PLACES_API_KEY not set in environment');
    console.error('Add it to your .env file');
    process.exit(1);
  }

  console.log('============================================================');
  console.log('Google Places Church Enrichment');
  console.log('============================================================');
  console.log(`Country: ${countryCode || 'All'}`);
  console.log(`Limit: ${limit || 'No limit'}`);
  console.log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  console.log('============================================================');
  console.log('');

  let jobId: string | null = null;
  try {
    // Job tracking
    jobId = await createOrResumeJob(args);
    if (!jobId && !dryRun) {
      jobId = await createNewJob({ countryCode, limit, dryRun });
    }
    if (jobId) console.log(`Job ID: ${jobId}\n`);

    const stats = await enrichChurches(countryCode, limit, dryRun, jobId);

    console.log('');
    console.log('============================================================');
    console.log('Enrichment Summary');
    console.log('============================================================');
    console.log(`Churches processed: ${stats.processed}`);
    console.log(`Churches enriched: ${stats.enriched}`);
    console.log(`Not found on Google: ${stats.notFound}`);
    console.log(`Websites added: ${stats.websitesAdded}`);
    console.log(`Phone numbers added: ${stats.phonesAdded}`);
    console.log(`Errors encountered: ${stats.errors}`);
    console.log('============================================================');

    // Complete job
    if (jobId) {
      await completeJob(jobId);
    }
  } catch (error: unknown) {
    // Without this the job row would stay in 'running' forever after a crash.
    if (jobId) {
      const message = error instanceof Error ? error.message : String(error);
      try {
        await completeJob(jobId, message);
      } catch (trackingError: unknown) {
        // Don't let a bookkeeping failure hide the original error.
        console.error('Failed to record job failure:', trackingError);
      }
    }
    throw error;
  } finally {
    // Always release the database connections, on success and on failure.
    await prisma.$disconnect();
    await pool.end();
  }
}
// Script entry point: run main(); any rejection is logged and the process
// exits with status 1.
main().catch((fatal) => {
  console.error('Fatal error:', fatal);
  process.exit(1);
});