Reset local main to gitea/master (new source of truth) and restored local-only files: web scrapers, admin dashboard, ChromaDB integration, debug scripts, and utility libraries that aren't tracked in Gitea. Gitea master adds: discovermass, buscarmisas-network, hk-parishes, bohosluzby, kerknet, gottesdienstzeiten, miserend importers, ClaimRequest model, forward geocoding, heartbeat healthcheck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
409 lines
12 KiB
TypeScript
409 lines
12 KiB
TypeScript
#!/usr/bin/env tsx
|
|
/**
|
|
* Enrich OSM churches with Google Places data (website, phone, Google place ID)
|
|
*
|
|
* Usage:
|
|
* npx tsx scripts/enrich-with-google-places.ts --limit 10 --dry-run
|
|
* npx tsx scripts/enrich-with-google-places.ts --country BR --limit 100
|
|
* npx tsx scripts/enrich-with-google-places.ts --all
|
|
*
|
|
* Rate Limiting:
|
|
* - Free tier: $200/month credit
|
|
* - Text Search: ~$17 per 1000 requests
|
|
* - $200 / $17 = ~11,764 requests per month
|
|
* - ~390 churches per day to stay within free tier
|
|
* - Script uses 2-second delay between requests (max 1,800/hour)
|
|
*/
|
|
|
|
// Load .env for database connection
|
|
import dotenv from 'dotenv';
|
|
import path from 'path';
|
|
dotenv.config({ path: path.resolve(process.cwd(), '.env') });
|
|
|
|
// Use DATABASE_URL from .env (works for both local dev and NAS/production)
|
|
|
|
import { Pool } from 'pg';
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
|
import { PrismaClient } from '@prisma/client';
|
|
import axios from 'axios';
|
|
|
|
// Postgres pool built from DATABASE_URL; Prisma talks to the database through
// the pg driver adapter wrapped around that pool.
const pool = new Pool({ connectionString: process.env['DATABASE_URL'] });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter: adapter });

// Google Places text-search endpoint and the API key sent with each request.
const PLACES_API_URL = 'https://places.googleapis.com/v1/places:searchText';
const GOOGLE_PLACES_API_KEY = process.env['GOOGLE_PLACES_API_KEY'];

// Pause between consecutive Places requests, in milliseconds (2 seconds).
const RATE_LIMIT_MS = 2 * 1000;
|
|
|
|
// --- Job Tracking ---
|
|
/**
 * Resume an existing background job when `--job-id <id>` is present in `args`.
 *
 * The referenced job row is set to status `running` and its `startedAt` is
 * overwritten with the current time.
 *
 * @param args - CLI arguments to scan for the `--job-id` flag.
 * @returns The resumed job's ID, or `null` when `--job-id` was not passed.
 * @throws Error when `--job-id` is passed without a value. Errors raised by
 *   the database update propagate unchanged.
 */
async function createOrResumeJob(args: string[]): Promise<string | null> {
  const flagIndex = args.indexOf('--job-id');
  if (flagIndex === -1) {
    return null;
  }

  const jobId = args[flagIndex + 1];
  // A trailing `--job-id`, or one directly followed by another flag, carries
  // no usable ID; fail loudly instead of sending `undefined` to the database.
  if (!jobId || jobId.startsWith('--')) {
    throw new Error('--job-id requires a job ID value');
  }

  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status: 'running', startedAt: new Date() },
  });
  return jobId;
}
|
|
|
|
/**
 * Insert a new `google-enrichment` background job that is already marked as
 * running, with `startedAt` set to the current time.
 *
 * @param config - Run options stored on the job row.
 * @returns The ID of the created job.
 */
async function createNewJob(config: Record<string, unknown>): Promise<string> {
  const startedAt = new Date();
  const { id } = await prisma.backgroundJob.create({
    data: {
      type: 'google-enrichment',
      status: 'running',
      startedAt,
      config: config as any,
    },
  });
  return id;
}
|
|
|
|
/**
 * Write the current progress counters onto a background job row.
 *
 * @param jobId - ID of the job to update.
 * @param processed - Value stored in the job's `processed` column.
 * @param succeeded - Value stored in the job's `succeeded` column.
 * @param failed - Value stored in the job's `failed` column.
 * @param itemsFound - Value stored in the job's `itemsFound` column.
 * @param totalItems - Value stored in the job's `totalItems` column.
 */
async function updateJobProgress(
  jobId: string,
  processed: number,
  succeeded: number,
  failed: number,
  itemsFound: number,
  totalItems: number
): Promise<void> {
  const counters = { processed, succeeded, failed, itemsFound, totalItems };
  await prisma.backgroundJob.update({ where: { id: jobId }, data: counters });
}
|
|
|
|
/**
 * Report whether a background job's status is `stopping`.
 *
 * @param jobId - ID of the job to look up.
 * @returns `true` only when the job exists and its status is `stopping`.
 */
async function checkJobStopping(jobId: string): Promise<boolean> {
  const job = await prisma.backgroundJob.findUnique({ where: { id: jobId } });
  if (!job) {
    return false;
  }
  return job.status === 'stopping';
}
|
|
|
|
/**
 * Close out a background job, stamping `completedAt` with the current time.
 *
 * @param jobId - ID of the job to finish.
 * @param error - When given (and non-empty) the job is marked `failed` and
 *   the message is stored; otherwise the job is marked `completed`.
 */
async function completeJob(jobId: string, error?: string): Promise<void> {
  const status = error ? 'failed' : 'completed';
  const completedAt = new Date();
  await prisma.backgroundJob.update({
    where: { id: jobId },
    data: { status, error, completedAt },
  });
}
|
|
|
|
/**
 * Country priority order — largest OSM church counts first, since those
 * have the most un-enriched churches. Covers all countries from the
 * CATHOLIC_COUNTRIES lists in import-osm-churches.ts.
 *
 * Each code must appear exactly once: priority mode queries the countries in
 * this order before any church is marked as searched, so a repeated code
 * would queue (and pay for) the same churches a second time.
 */
const COUNTRY_PRIORITY = [
  // Top tier: 5000+ OSM churches
  'FR', 'IT', 'ES', 'DE', 'PL', 'BR',
  // High tier: 1000-5000
  'PT', 'AT', 'BE', 'CZ', 'PH', 'HU', 'US', 'MX', 'HR', 'GB',
  'CR', 'SK', 'EC', 'CH', 'AR', 'CA', 'CO', 'NL', 'IE', 'IN',
  'SI', 'AU',
  // Medium tier: 100-1000
  'PE', 'RO', 'KR', 'CL', 'ID', 'LT', 'BO', 'VN', 'BA', 'BY',
  'UA', 'VE', 'HN', 'UG', 'CD', 'GT', 'CU', 'SV', 'NI', 'PA',
  'DO', 'CN', 'JP', 'LV', 'RS', 'TZ', 'KE', 'AL', 'RU',
  // Lower tier: remaining countries
  'LU', 'MT', 'NZ', 'PG', 'FJ', 'NC', 'PF', 'UY', 'PY', 'HT',
  'CM', 'RW', 'BI', 'MG', 'MW', 'ZM', 'ZW', 'MZ', 'AO', 'NG',
  'BJ', 'TG', 'CI', 'BF', 'ML', 'NE', 'SN', 'GN', 'LR', 'SL',
  'GH', 'GA', 'CG', 'CF', 'TD', 'SD', 'ET', 'ER', 'SO',
  'TL', 'MY', 'SG', 'TH', 'LA', 'KH', 'MM', 'LK', 'BD', 'PK',
  'LB', 'IL', 'PS', 'JO', 'SY', 'IQ',
  'GF', 'SR', 'GY', 'BS', 'BB', 'JM', 'TT', 'GD', 'LC', 'VC',
  'AG', 'DM', 'KN', 'MC', 'SM', 'VA', 'LI', 'AD',
  // 'RS' is already listed in the medium tier above.
  'MK', 'EE', 'GE', 'AM',
  'NA', 'BW', 'LS', 'SZ', 'MU', 'SC', 'KM', 'CV', 'ST', 'GQ',
  'DJ', 'GM', 'BT', 'NP', 'AF', 'KZ', 'UZ', 'TM', 'TJ', 'KG',
  'MN', 'BN', 'MV', 'WS', 'TO', 'VU', 'SB', 'KI', 'NR', 'TV',
  'FM', 'MH', 'PW',
];
|
|
|
|
/** Outcome of a single Google Places text search for one church. */
interface GooglePlacesResult {
  /** Whether the search returned at least one place. */
  found: boolean;

  /** Website URI of the first returned place, when it has one. */
  website?: string;

  /** National-format phone number of the first returned place, when it has one. */
  phone?: string;

  /** Google place ID of the first returned place, when present. */
  placeId?: string;
}
|
|
|
|
/** Running counters collected during one enrichment run. */
interface EnrichmentStats {
  /** Churches visited by the loop, including ones that ended in an error. */
  processed: number;

  /** Churches whose saved update carried a website or phone from Google. */
  enriched: number;

  /** Churches for which Google Places returned no place. */
  notFound: number;

  /** Churches whose search or database update threw. */
  errors: number;

  /** Search results that included a website. */
  websitesAdded: number;

  /** Search results that included a phone number. */
  phonesAdded: number;
}
|
|
|
|
/** Subset of a Places text-search place, limited to the requested field mask. */
interface PlacesTextSearchPlace {
  id?: string;
  websiteUri?: string;
  nationalPhoneNumber?: string;
}

/** Response body of the Places text-search call; `places` is absent when nothing matched. */
interface PlacesTextSearchResponse {
  places?: PlacesTextSearchPlace[];
}

/**
 * Look up a church with the Google Places text-search endpoint.
 *
 * The query text is the church name followed by "city, state" (missing parts
 * are skipped). The coordinates are sent as a 500 m circular location bias.
 * Only the first place in the response is used.
 *
 * @param name - Church name used as the start of the query text.
 * @param city - City appended to the query, if known.
 * @param state - State appended to the query, if known.
 * @param latitude - Latitude of the location-bias circle's center.
 * @param longitude - Longitude of the location-bias circle's center.
 * @returns `{ found: false }` when no place is returned; otherwise
 *   `found: true` plus whichever of website, phone and place ID the first
 *   place carries.
 * @throws Error when `GOOGLE_PLACES_API_KEY` is not set.
 * @throws Error with message `RATE_LIMITED` when the API answers HTTP 429.
 *   Any other request error is rethrown unchanged.
 */
async function searchGooglePlaces(
  name: string,
  city: string | null,
  state: string | null,
  latitude: number,
  longitude: number
): Promise<GooglePlacesResult> {
  if (!GOOGLE_PLACES_API_KEY) {
    throw new Error('GOOGLE_PLACES_API_KEY not set in environment');
  }

  // Build search query
  const location = [city, state].filter(Boolean).join(', ');
  const textQuery = `${name} ${location}`.trim();

  try {
    const response = await axios.post<PlacesTextSearchResponse>(
      PLACES_API_URL,
      {
        textQuery,
        locationBias: {
          circle: {
            center: { latitude, longitude },
            radius: 500, // 500 meters
          },
        },
      },
      {
        headers: {
          'Content-Type': 'application/json',
          'X-Goog-Api-Key': GOOGLE_PLACES_API_KEY,
          'X-Goog-FieldMask': 'places.id,places.displayName,places.websiteUri,places.nationalPhoneNumber',
        },
      }
    );

    const places = response.data.places;
    if (places && places.length > 0) {
      const place = places[0]; // Take first result
      return {
        found: true,
        website: place.websiteUri || undefined,
        phone: place.nationalPhoneNumber || undefined,
        placeId: place.id || undefined,
      };
    }

    return { found: false };
  } catch (error: unknown) {
    // Translate HTTP 429 into a sentinel error the caller uses to stop the run.
    if (axios.isAxiosError(error) && error.response?.status === 429) {
      console.error('Rate limited by Google Places API');
      throw new Error('RATE_LIMITED');
    }
    throw error;
  }
}
|
|
|
|
/**
 * Enrich OSM churches that have not yet been searched (`googleSearchedAt` is
 * null) with website, phone and place ID from Google Places.
 *
 * Selection:
 * - `countryCode` given: churches of that country, oldest first, up to `limit`.
 * - otherwise: walk `COUNTRY_PRIORITY` in order, queuing each country's
 *   churches (oldest first) until `limit` is reached. When `limit` is not
 *   given (or is 0) this mode queues at most 390 churches.
 *
 * Each queued church is searched once. Unless `dryRun` is set, the church is
 * stamped with `googleSearchedAt` whether or not a place was found, so it is
 * not queried again. A church whose search or update throws is not stamped.
 * The loop stops early when Google answers with a rate-limit error or when
 * the tracked job's status becomes `stopping`.
 *
 * @param countryCode - Restrict the run to this country code.
 * @param limit - Maximum number of churches to process.
 * @param dryRun - Search and log, but do not write to the church table.
 * @param jobId - When set, progress is written to this background job every
 *   10 churches and once more at the end; `succeeded` and `itemsFound` both
 *   receive the enriched count.
 * @returns Counters describing the run.
 */
async function enrichChurches(
  countryCode?: string,
  limit?: number,
  dryRun: boolean = false,
  jobId?: string | null
): Promise<EnrichmentStats> {
  const stats: EnrichmentStats = {
    processed: 0,
    enriched: 0,
    notFound: 0,
    errors: 0,
    websitesAdded: 0,
    phonesAdded: 0,
  };

  let churches;

  if (countryCode) {
    // Manual override: process specific country
    console.log(`Manual mode: Processing country ${countryCode}`);
    churches = await prisma.church.findMany({
      where: {
        source: 'osm',
        googleSearchedAt: null,
        country: countryCode,
      },
      take: limit,
      orderBy: { createdAt: 'asc' },
    });
  } else {
    // Priority mode: sequential through countries (exhaust each before moving on)
    console.log('Priority mode: Processing countries sequentially');
    console.log(`Top priority countries: ${COUNTRY_PRIORITY.slice(0, 10).join(', ')}...\n`);

    churches = [];
    const targetTotal = limit || 390;

    // Iterate a de-duplicated list: queued churches are not stamped yet, so
    // querying the same country twice would queue the same rows again.
    for (const country of new Set(COUNTRY_PRIORITY)) {
      if (churches.length >= targetTotal) break;

      const remaining = targetTotal - churches.length;
      const batch = await prisma.church.findMany({
        where: {
          source: 'osm',
          googleSearchedAt: null,
          country,
        },
        take: remaining,
        orderBy: { createdAt: 'asc' },
      });

      if (batch.length > 0) {
        churches.push(...batch);
        console.log(` Queued ${batch.length} churches from ${country}`);
      }
    }
  }

  console.log(`\nFound ${churches.length} churches to enrich`);
  console.log('');

  for (const church of churches) {
    stats.processed++;

    try {
      console.log(`[${stats.processed}/${churches.length}] ${church.name} (${church.city}, ${church.state})`);

      const result = await searchGooglePlaces(
        church.name,
        church.city,
        church.state,
        church.latitude,
        church.longitude
      );

      if (result.found) {
        console.log(' ✓ Found on Google Places');

        if (result.website) {
          console.log(` Website: ${result.website}`);
          stats.websitesAdded++;
        }

        if (result.phone) {
          console.log(` Phone: ${result.phone}`);
          stats.phonesAdded++;
        }

        if (!dryRun) {
          // Google's values win; existing values are kept when Google has none.
          await prisma.church.update({
            where: { id: church.id },
            data: {
              website: result.website || church.website,
              phone: result.phone || church.phone,
              googlePlaceId: result.placeId || church.googlePlaceId,
              hasWebsite: !!(result.website || church.website),
              googleSearchedAt: new Date(),
            },
          });
          if (result.website || result.phone) {
            stats.enriched++;
          }
        }
      } else {
        console.log(' ✗ Not found on Google Places');
        stats.notFound++;

        // Mark as attempted so we don't re-query this church
        if (!dryRun) {
          await prisma.church.update({
            where: { id: church.id },
            data: { googleSearchedAt: new Date() },
          });
        }
      }
    } catch (error: unknown) {
      stats.errors++;
      const message = error instanceof Error ? error.message : String(error);
      if (message === 'RATE_LIMITED') {
        console.error(' ⚠ Rate limited, stopping enrichment');
        break;
      }
      console.error(` ✗ Error: ${message}`);
    }

    // Rate limiting. Runs after failures too, so a streak of errors cannot
    // send requests to the API back-to-back.
    await new Promise((resolve) => setTimeout(resolve, RATE_LIMIT_MS));

    // Job tracking: update progress every 10 items and check for stop
    if (jobId && stats.processed % 10 === 0) {
      await updateJobProgress(jobId, stats.processed, stats.enriched, stats.errors, stats.enriched, churches.length);
      const stopping = await checkJobStopping(jobId);
      if (stopping) {
        console.log('\nJob stop requested via admin dashboard.');
        break;
      }
    }

    // Progress update every 50 churches
    if (stats.processed % 50 === 0) {
      console.log('');
      console.log(`Progress: ${stats.processed}/${churches.length} processed`);
      console.log(` Enriched: ${stats.enriched}, Not found: ${stats.notFound}, Errors: ${stats.errors}`);
      console.log('');
    }
  }

  // Final job update
  if (jobId) {
    await updateJobProgress(jobId, stats.processed, stats.enriched, stats.errors, stats.enriched, churches.length);
  }

  return stats;
}
|
|
|
|
/**
 * CLI entry point: parse flags, run the enrichment, print a summary and
 * record the outcome on the tracking job.
 *
 * Flags:
 * - `--country <code>`: process only that country.
 * - `--limit <n>`: maximum churches to process (default 10); must be a
 *   positive integer.
 * - `--all`: pass no limit to the enrichment (overrides `--limit`).
 * - `--dry-run`: search only, write nothing to churches and create no job.
 * - `--job-id <id>`: resume an existing background job instead of creating one.
 *
 * Exits with code 1 when the API key is missing or `--limit` is invalid.
 * If the run throws, the tracking job (if any) is marked `failed` with the
 * error message, and the error is rethrown. Database connections are closed
 * on both success and failure.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const countryIndex = args.indexOf('--country');
  const limitIndex = args.indexOf('--limit');
  const dryRun = args.includes('--dry-run');
  const all = args.includes('--all');

  const countryCode = countryIndex !== -1 ? args[countryIndex + 1] : undefined;

  let limit: number | undefined;
  if (all) {
    limit = undefined;
  } else if (limitIndex !== -1) {
    const rawLimit = args[limitIndex + 1];
    limit = Number.parseInt(rawLimit ?? '', 10);
    // Reject a missing, non-numeric or non-positive value instead of passing
    // NaN or 0 down to the database query.
    if (!Number.isInteger(limit) || limit < 1) {
      console.error(`Error: --limit requires a positive integer (got ${rawLimit ?? 'nothing'})`);
      process.exit(1);
    }
  } else {
    limit = 10;
  }

  if (!GOOGLE_PLACES_API_KEY) {
    console.error('Error: GOOGLE_PLACES_API_KEY not set in environment');
    console.error('Add it to your .env file');
    process.exit(1);
  }

  const rule = '============================================================';

  console.log(rule);
  console.log('Google Places Church Enrichment');
  console.log(rule);
  console.log(`Country: ${countryCode || 'All'}`);
  console.log(`Limit: ${limit || 'No limit'}`);
  console.log(`Dry run: ${dryRun ? 'Yes' : 'No'}`);
  console.log(rule);
  console.log('');

  let jobId: string | null = null;
  try {
    // Job tracking
    jobId = await createOrResumeJob(args);
    if (!jobId && !dryRun) {
      jobId = await createNewJob({ countryCode, limit, dryRun });
    }
    if (jobId) console.log(`Job ID: ${jobId}\n`);

    const stats = await enrichChurches(countryCode, limit, dryRun, jobId);

    console.log('');
    console.log(rule);
    console.log('Enrichment Summary');
    console.log(rule);
    console.log(`Churches processed: ${stats.processed}`);
    console.log(`Churches enriched: ${stats.enriched}`);
    console.log(`Not found on Google: ${stats.notFound}`);
    console.log(`Websites added: ${stats.websitesAdded}`);
    console.log(`Phone numbers added: ${stats.phonesAdded}`);
    console.log(`Errors encountered: ${stats.errors}`);
    console.log(rule);

    // Complete job
    if (jobId) {
      await completeJob(jobId);
    }
  } catch (error: unknown) {
    // Do not leave the job stuck in `running`; record why the run died.
    if (jobId) {
      const message = error instanceof Error ? error.message : String(error);
      try {
        await completeJob(jobId, message);
      } catch (markError: unknown) {
        console.error('Failed to mark job as failed:', markError);
      }
    }
    throw error;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
}
|
|
|
|
// Run the CLI; any failure that escapes main() is logged and turned into
// exit code 1.
const reportFatal = (fatal: unknown): void => {
  console.error('Fatal error:', fatal);
  process.exit(1);
};

main().catch(reportFatal);
|