scripts/import-msze-info.ts

#!/usr/bin/env tsx
/**
 * Import Catholic churches and mass schedules from msze.info (Poland)
 *
 * msze.info is a Polish directory of Catholic parishes with mass schedules.
 * The site uses numbered sitemaps (Churches1.xml through Churches11.xml)
 * with ~500 URLs each, containing both /kosciol/{id} (church pages) and
 * /msze-online/{slug} (livestream pages).
 *
 * Import strategy:
 *   1. Fetch all 11 sitemaps → extract /kosciol/{id} URLs (skip /msze-online/)
 *   2. For each church: fetch HTML, parse name/address/phone/website/schedule
 *   3. Extract coordinates from embedded tomtom_codeAddress() JS call
 *   4. Match against existing PL churches, upsert
 *
 * Usage:
 *   npx tsx scripts/import-msze-info.ts --all
 *   npx tsx scripts/import-msze-info.ts --all --dry-run
 *   npx tsx scripts/import-msze-info.ts --all --resume-from 500
 *   npx tsx scripts/import-msze-info.ts --all --job-id {uuid}
 */

import dotenv from 'dotenv';
import path from 'path';

dotenv.config({ path: path.resolve(process.cwd(), '.env.local') });
dotenv.config({ path: path.resolve(process.cwd(), '.env') });

import { Pool } from 'pg';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';

// Connection string: DATABASE_URL when set and non-empty, otherwise a local default.
const dbUrl = process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/nearestmass';
// Log the target with the first `:password@` segment replaced by `:***@` so the password is not printed.
console.log(`Connecting to database: ${dbUrl.replace(/:[^:@]+@/, ':***@')}`);
// pg connection pool. SSL (with certificate verification disabled) is enabled only when the
// URL contains the substring "neon" — presumably a Neon-hosted database; NOTE(review): confirm.
const pool = new Pool({
  connectionString: dbUrl,
  ssl: dbUrl.includes('neon') ? { rejectUnauthorized: false } : undefined,
});
// Prisma client that talks to the database through the pg pool via the driver adapter.
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

import { findDuplicateChurch } from '../src/lib/church-matcher';
import type { ExistingChurch } from '../src/lib/church-matcher';

// ─── Constants ───────────────────────────────────────────────────────────────

/** Origin used to build sitemap and church page URLs. */
const SITE_BASE = 'https://www.msze.info';
/** Number of numbered sitemaps to fetch (Churches1.xml through Churches<SITEMAP_COUNT>.xml). */
const SITEMAP_COUNT = 11;
/** User-Agent header sent with every outgoing request. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';
/** Default pause, in milliseconds, inserted before every request except the first. */
const REQUEST_DELAY_MS = 1500;

// ─── Types ───────────────────────────────────────────────────────────────────

/** Church details extracted from a single msze.info church page. */
interface ParsedChurch {
  /** Church name taken from the page's <h1>; empty string when no <h1> is found. */
  name: string;
  /** Address text, or null when none could be extracted. */
  address: string | null;
  /** City from the <h1> suffix, falling back to the last comma-separated address part. */
  city: string | null;
  /** Postal code in XX-XXX form when one appears in the address. */
  zip: string | null;
  /** Target of the first `tel:` link on the page. */
  phone: string | null;
  /** External website URL linked from the page. */
  website: string | null;
  /** Currently always null: the parser does not extract e-mail addresses. */
  email: string | null;
  /** Latitude parsed from the embedded map call; 0 when no coordinates were found. */
  latitude: number;
  /** Longitude parsed from the embedded map call; 0 when no coordinates were found. */
  longitude: number;
}

/** One mass time on one day of the week. */
interface ParsedSchedule {
  dayOfWeek: number; // 0=Sun, 1=Mon, ..., 6=Sat
  time: string;      // zero-padded 24-hour "HH:MM", e.g. "05:00", "18:30"
}

/** Counters accumulated over one import run and printed in the final summary. */
interface ImportStats {
  /** Church IDs handed to processChurch (counted before any fetch or parse). */
  churchesFound: number;
  /** Churches matched to an existing database record. */
  churchesMatched: number;
  /** Churches inserted as new records (or that would be, in dry-run mode). */
  churchesCreated: number;
  /** Churches skipped (no name found, or a unique-constraint conflict). */
  churchesSkipped: number;
  /** Churches for which at least one schedule entry was saved. */
  schedulesProcessed: number;
  /** Total number of individual mass schedule entries saved. */
  massSchedulesCreated: number;
  /** Fetch, processing, or schedule-saving failures. */
  errors: number;
}

/** Parsed command-line options. */
interface CLIArgs {
  /** Set by --all; the script refuses to run without it. */
  all: boolean;
  /** Set by --dry-run; when true no database writes for churches or schedules are made. */
  dryRun: boolean;
  /** Value of --resume-from: number of leading church IDs (in sorted order) to skip. */
  resumeFrom?: number;
  /** Value of --job-id: id of the backgroundJob record whose status is updated. */
  jobId?: string;
}

// ─── HTTP Client ─────────────────────────────────────────────────────────────

/** Number of HTTP requests issued so far; drives request pacing and is printed in the summary. */
let requestCount = 0;

/**
 * Returns a promise that settles (with no value) once a timer of `ms`
 * milliseconds has fired.
 */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Downloads `url` and returns the response body as text.
 *
 * Every call except the very first one of the run waits `delayMs` before
 * sending, so requests are paced. Failures never throw: a non-2xx status or a
 * network error is logged and reported as `null`.
 *
 * @param url      Absolute URL to fetch.
 * @param delayMs  Pause before the request, in milliseconds.
 * @returns The body text, or `null` when the request failed.
 */
async function fetchPage(url: string, delayMs: number = REQUEST_DELAY_MS): Promise<string | null> {
  const mustWait = requestCount > 0;
  if (mustWait) {
    await delay(delayMs);
  }
  requestCount += 1;

  const headers = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  try {
    const res = await fetch(url, { headers });
    if (res.ok) {
      return await res.text();
    }
    console.error(`  HTTP ${res.status} for ${url}`);
    return null;
  } catch (err) {
    const reason = err instanceof Error ? err.message : err;
    console.error(`  Fetch error for ${url}: ${reason}`);
    return null;
  }
}

// ─── Sitemap Parser ──────────────────────────────────────────────────────────

/**
 * Collects the numeric church IDs listed in the numbered sitemaps.
 *
 * Each sitemap is fetched in turn; only `/kosciol/{id}` entries are kept
 * (other URLs such as `/msze-online/...` do not match the pattern). A sitemap
 * that cannot be fetched is logged and skipped.
 *
 * @returns Unique church IDs as strings, sorted in ascending numeric order.
 */
async function fetchChurchUrlsFromSitemaps(): Promise<string[]> {
  // A Set keeps first-seen order, which also takes care of de-duplication.
  const uniqueIds = new Set<string>();

  for (let n = 1; n <= SITEMAP_COUNT; n++) {
    const sitemapUrl = `${SITE_BASE}/sitemap/Churches${n}.xml`;
    console.log(`  Fetching ${sitemapUrl}...`);

    const xml = await fetchPage(sitemapUrl);
    if (!xml) {
      console.error(`  Failed to fetch ${sitemapUrl}`);
      continue;
    }

    // Only <loc> entries pointing at /kosciol/{digits} are of interest.
    const churchLoc = /<loc>https?:\/\/(?:www\.)?msze\.info\/kosciol\/(\d+)<\/loc>/g;
    for (const hit of xml.matchAll(churchLoc)) {
      uniqueIds.add(hit[1]);
    }
  }

  // Numeric ordering keeps the processing order stable between runs.
  const sortedIds = [...uniqueIds];
  sortedIds.sort((left, right) => parseInt(left) - parseInt(right));

  console.log(`Found ${sortedIds.length} unique church IDs from ${SITEMAP_COUNT} sitemaps`);
  return sortedIds;
}

// ─── HTML Parsers ────────────────────────────────────────────────────────────

/**
 * Extracts church details from the HTML of a msze.info church page.
 *
 * - Name and city come from the `<h1>` ("Church Name, City"; the city is the
 *   part after the last comma).
 * - The address comes from the "Adres:" label, falling back to the first
 *   argument of the embedded `tomtom_codeAddress(...)` call.
 * - Coordinates come from the last two arguments of
 *   `tomtom_codeAddress('addr', zoom, 'name', null, lat, lng)`.
 *
 * @param html Raw page HTML.
 * @returns The parsed fields. `name` is an empty string when no `<h1>` is
 *          present; `latitude`/`longitude` are 0 when no usable coordinates
 *          were found; `email` is always null.
 */
function parseChurchPage(html: string): ParsedChurch {
  const stripTags = (fragment: string): string => fragment.replace(/<[^>]+>/g, '');

  // Name: from <h1>Church Name, City</h1>
  let name = '';
  let cityFromH1: string | null = null;
  const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  if (h1Match) {
    const raw = stripTags(h1Match[1]).trim();
    // Split "Church Name, City" — city is the last comma-separated part
    const lastComma = raw.lastIndexOf(',');
    if (lastComma > 0) {
      name = raw.substring(0, lastComma).trim();
      cityFromH1 = raw.substring(lastComma + 1).trim();
    } else {
      name = raw;
    }
  }

  // Address: <span class="highlight">Adres:</span> <strong>Street, City</strong>
  let address: string | null = null;
  let city: string | null = cityFromH1;
  let zip: string | null = null;

  const addressMatch = html.match(/Adres:<\/span>\s*(?:<strong>)?([\s\S]*?)(?:<\/strong>|<br|<\/p)/i);
  if (addressMatch) {
    address = stripTags(addressMatch[1]).replace(/\s+/g, ' ').trim() || null;
  }

  // Fallback address: first (quoted) argument of tomtom_codeAddress(...)
  if (!address) {
    const tomtomAddrMatch = html.match(/tomtom_codeAddress\s*\(\s*'([^']+)'/);
    if (tomtomAddrMatch) {
      address = tomtomAddrMatch[1].trim() || null;
    }
  }

  if (address) {
    // Polish postal code (XX-XXX format)
    const zipMatch = address.match(/\b(\d{2}-\d{3})\b/);
    if (zipMatch) {
      zip = zipMatch[1];
    }

    // City from the address only when the <h1> did not provide one:
    // last comma-separated part with any postal code removed.
    if (!city) {
      const parts = address.split(',');
      if (parts.length > 1) {
        city = parts[parts.length - 1].replace(/\d{2}-\d{3}/, '').trim() || null;
      }
    }
  }

  // Coordinates: tomtom_codeAddress('addr', zoom, 'name', null, lat, lng).
  // The address and name arguments are quoted strings that routinely contain
  // commas ("Street 5, City"), so each is matched as a whole quoted literal
  // (single or double quotes, backslash escapes allowed). The bare `[^,]+`
  // alternative keeps accepting unquoted, comma-free arguments.
  let latitude = 0;
  let longitude = 0;
  const coordMatch = html.match(
    /tomtom_codeAddress\s*\(\s*(?:'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|[^,]+)\s*,\s*\d+\s*,\s*(?:'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|[^,]+)\s*,\s*(?:null|'[^']*')\s*,\s*(-?[\d.]+)\s*,\s*(-?[\d.]+)\s*\)/
  );
  if (coordMatch) {
    const lat = parseFloat(coordMatch[1]);
    const lng = parseFloat(coordMatch[2]);
    // 0 is the "no coordinates" sentinel; also reject values that cannot be
    // a real latitude/longitude.
    const usable =
      Number.isFinite(lat) &&
      Number.isFinite(lng) &&
      lat !== 0 &&
      lng !== 0 &&
      Math.abs(lat) <= 90 &&
      Math.abs(lng) <= 180;
    if (usable) {
      latitude = lat;
      longitude = lng;
    }
  }

  // Phone: first <a href="tel:...">
  let phone: string | null = null;
  const phoneMatch = html.match(/<a\s+href="tel:([^"]+)"/i);
  if (phoneMatch) {
    phone = phoneMatch[1].trim() || null;
  }

  // Website: external link whose text contains "Witryna"
  let website: string | null = null;
  const websiteMatch = html.match(/<a\s+href="(https?:\/\/[^"]+)"[^>]*>[^<]*Witryna/i);
  if (websiteMatch) {
    website = websiteMatch[1].trim() || null;
  }
  // Otherwise: a link whose text itself looks like a URL (www.xxx)
  if (!website) {
    const wwwMatch = html.match(/<a\s+href="(https?:\/\/[^"]+)"[^>]*>www\.[^<]+<\/a>/i);
    if (wwwMatch) {
      website = wwwMatch[1].trim() || null;
    }
  }

  // Email: not reliably available (Cloudflare-protected)
  const email: string | null = null;

  return { name, address, city, zip, phone, website, email, latitude, longitude };
}

/**
 * Builds the list of (day, time) mass entries found on a church page.
 *
 * The page is cut into sections, each starting at an h2–h4 heading and running
 * up to the next heading, `<footer`, `<script`, or the end of the document.
 * Only sections whose heading text starts with "MSZE" are used; the heading
 * decides the days and the section body supplies the times. Each day/time
 * pair is emitted once, in first-seen order.
 *
 * @param html Raw page HTML.
 * @returns De-duplicated schedule entries.
 */
function parseMassSchedule(html: string): ParsedSchedule[] {
  const entries: ParsedSchedule[] = [];
  const emitted = new Set<string>();

  // Group 1: heading inner HTML. Group 2: everything up to the next section start.
  const sectionPattern = /<h[2-4][^>]*>([\s\S]*?)<\/h[2-4]>([\s\S]*?)(?=<h[2-4]|<footer|<script|$)/gi;

  for (const section of html.matchAll(sectionPattern)) {
    const headingText = section[1].replace(/<[^>]+>/g, '').trim().toUpperCase();
    const body = section[2];

    // Skip anything that is not a mass-schedule heading.
    if (!headingText.startsWith('MSZE')) continue;

    const days = resolvePolishDays(headingText);
    if (days.length === 0) continue;

    const times = extractTimes(body);

    for (const dayOfWeek of days) {
      for (const time of times) {
        const dedupKey = `${dayOfWeek}:${time}`;
        if (!emitted.has(dedupKey)) {
          emitted.add(dedupKey);
          entries.push({ dayOfWeek, time });
        }
      }
    }
  }

  return entries;
}

/**
 * Maps an upper-cased Polish schedule heading to day-of-week numbers
 * (0 = Sunday … 6 = Saturday).
 *
 * Rules are tried in order and the first one with a matching fragment wins:
 * Sunday/holiday headings, then "weekday" headings (Monday–Saturday), then
 * individual day names. Returns an empty array when nothing matches.
 *
 * @param heading Heading text, expected to be upper-cased by the caller.
 */
function resolvePolishDays(heading: string): number[] {
  // [fragments to look for, days they stand for] — order matters.
  const rules: ReadonlyArray<readonly [readonly string[], readonly number[]]> = [
    [['NIEDZIEL'], [0]],
    [['DNI POWSZEDNIE', 'POWSZEDNI'], [1, 2, 3, 4, 5, 6]],
    [['PONIEDZIA'], [1]],
    [['WTOREK', 'WTORK'], [2]],
    [['ŚRODA', 'SRODA', 'ŚROD'], [3]],
    [['CZWARTEK', 'CZWART'], [4]],
    [['PIĄTEK', 'PIATEK', 'PIĄT'], [5]],
    [['SOBOT'], [6]],
  ];

  for (const [fragments, days] of rules) {
    if (fragments.some((fragment) => heading.includes(fragment))) {
      return [...days];
    }
  }
  return [];
}

/**
 * Pulls clock times out of a schedule section.
 *
 * Only text that follows a "godz." marker is considered, and only up to the
 * next `<p`, `<div`, `<br>` or `<h2>`–`<h4>` tag. Every H:MM / HH:MM token with
 * a valid hour (0–23) and minute (0–59) is returned zero-padded as "HH:MM",
 * in document order; duplicates are not removed here.
 *
 * @param text HTML fragment of one schedule section.
 */
function extractTimes(text: string): string[] {
  const clockToken = /(\d{1,2}):(\d{2})/g;
  const sectionBreak = /<(?:p|div|br\s*\/?>|h[2-4])/i;
  const twoDigits = (value: number): string => String(value).padStart(2, '0');

  return text
    .split(/godz\.\s*/i)
    .slice(1) // the piece before the first "godz." is never scanned
    .flatMap((afterMarker) => {
      const untilBreak = afterMarker.split(sectionBreak)[0];
      return Array.from(untilBreak.matchAll(clockToken))
        .map((token) => ({ hours: parseInt(token[1]), mins: parseInt(token[2]) }))
        .filter(({ hours, mins }) => hours >= 0 && hours <= 23 && mins >= 0 && mins <= 59)
        .map(({ hours, mins }) => `${twoDigits(hours)}:${twoDigits(mins)}`);
    });
}

// ─── Database Operations ─────────────────────────────────────────────────────

/**
 * Loads every church record with country 'PL' so incoming churches can be
 * matched against them in memory.
 *
 * @returns The selected fields of all existing Polish churches.
 */
async function loadExistingPolishChurches(): Promise<ExistingChurch[]> {
  console.log('Loading existing Polish churches for deduplication...');
  const churches = await prisma.church.findMany({
    where: { country: 'PL' },
    // Fetch only the columns listed here: identity, coordinates, the per-source
    // external IDs, and the contact fields that processChurch fills when empty.
    select: {
      id: true,
      name: true,
      latitude: true,
      longitude: true,
      osmId: true,
      baiduId: true,
      masstimesId: true,
      orarimesseId: true,
      massSchedulesPhId: true,
      philmassId: true,
      horariosMisasId: true,
      mszeInfoId: true,
      weekdayMassesId: true,
      messesInfoId: true,
      bohosluzbyId: true,
      miserendId: true,
      kerknetId: true,
      gottesdienstzeitenId: true,
      source: true,
      website: true,
      phone: true,
      address: true,
    },
  });
  console.log(`Loaded ${churches.length} existing Polish churches`);
  return churches;
}

// ─── Import Logic ────────────────────────────────────────────────────────────

/**
 * Reports whether `error` looks like a Prisma unique-constraint violation:
 * either it carries Prisma's documented error code P2002 or its message
 * mentions "Unique constraint".
 */
function isUniqueConstraintError(error: unknown): boolean {
  if (!(error instanceof Error)) return false;
  const code = (error as Error & { code?: unknown }).code;
  return code === 'P2002' || error.message.includes('Unique constraint');
}

/**
 * Fetches, parses and imports a single msze.info church.
 *
 * Flow:
 *   1. Fetch `/kosciol/{churchId}`; a failed fetch counts as an error.
 *   2. Parse church details and mass schedule; pages without a name are skipped.
 *   3. Look for an existing church via `findDuplicateChurch`.
 *   4. Dry run: only update counters and log what would happen.
 *   5. Match: fill empty fields on the existing record, then replace its mass
 *      schedules (when any were parsed) in one transaction.
 *   6. No match: create the church, remember it for within-run dedup, then
 *      save its schedules in one transaction.
 *
 * A unique-constraint conflict on the church write is counted as "skipped".
 * A failure while saving schedules is counted as an error and logged, but
 * never reclassifies a church that was already matched or created.
 *
 * NOTE(review): a page without usable coordinates is still created with
 * latitude/longitude 0 — confirm that this sentinel is acceptable for new rows.
 *
 * @param churchId         msze.info numeric ID (as a string).
 * @param existingChurches In-memory dedup list; newly created churches are appended.
 * @param dryRun           When true, no database writes are performed.
 * @param stats            Counters mutated in place.
 * @throws Re-throws database errors from church writes that are not
 *         unique-constraint conflicts.
 */
async function processChurch(
  churchId: string,
  existingChurches: ExistingChurch[],
  dryRun: boolean,
  stats: ImportStats,
): Promise<void> {
  stats.churchesFound++;

  const url = `${SITE_BASE}/kosciol/${churchId}`;
  const churchHtml = await fetchPage(url);
  if (!churchHtml) {
    stats.errors++;
    return;
  }

  const parsed = parseChurchPage(churchHtml);
  if (!parsed.name) {
    console.log(`  Skipping ${churchId}: no name found`);
    stats.churchesSkipped++;
    return;
  }

  const schedules = parseMassSchedule(churchHtml);

  // Build candidate for dedup
  const candidate = {
    name: parsed.name,
    lat: parsed.latitude,
    lng: parsed.longitude,
    mszeInfoId: churchId,
  };

  const duplicate = findDuplicateChurch(candidate, existingChurches);

  if (dryRun) {
    if (duplicate) {
      stats.churchesMatched++;
      console.log(`  [MATCH] "${parsed.name}" → existing "${duplicate.name}" (${duplicate.id})`);
    } else {
      stats.churchesCreated++;
      console.log(`  [NEW] "${parsed.name}" (${parsed.city || 'unknown city'})`);
    }
    if (schedules.length > 0) {
      stats.schedulesProcessed++;
      stats.massSchedulesCreated += schedules.length;
    }
    return;
  }

  if (duplicate) {
    // Update existing church: always link the msze.info ID, and only fill
    // fields that are currently empty.
    const updateData: Record<string, unknown> = {
      mszeInfoId: churchId,
    };

    if (!duplicate.address && parsed.address) updateData.address = parsed.address;
    if (!duplicate.phone && parsed.phone) updateData.phone = parsed.phone;
    if (!duplicate.website && parsed.website) {
      updateData.website = parsed.website;
      updateData.hasWebsite = true;
    }

    // Update coordinates if existing has none (0/0) and we have them
    if (duplicate.latitude === 0 && duplicate.longitude === 0 && parsed.latitude !== 0) {
      updateData.latitude = parsed.latitude;
      updateData.longitude = parsed.longitude;
    }

    // city/zip/email are not part of the in-memory record, so read them first
    const dbRecord = await prisma.church.findUnique({
      where: { id: duplicate.id },
      select: { city: true, zip: true, email: true },
    });
    if (dbRecord && !dbRecord.city && parsed.city) updateData.city = parsed.city;
    if (dbRecord && !dbRecord.zip && parsed.zip) updateData.zip = parsed.zip;
    if (dbRecord && !dbRecord.email && parsed.email) updateData.email = parsed.email;

    try {
      await prisma.church.update({
        where: { id: duplicate.id },
        data: updateData,
      });
    } catch (error) {
      if (isUniqueConstraintError(error)) {
        console.log(`  Skipping ${churchId}: unique constraint conflict while updating ${duplicate.id}`);
        stats.churchesSkipped++;
        return;
      }
      throw error;
    }
    // Count the match only once the update went through, so a skipped church
    // is not reported as both matched and skipped.
    stats.churchesMatched++;

    // Replace mass schedules
    if (schedules.length > 0) {
      try {
        await prisma.$transaction(async (tx) => {
          await tx.massSchedule.deleteMany({ where: { churchId: duplicate.id } });
          await tx.massSchedule.createMany({
            data: schedules.map((s) => ({
              churchId: duplicate.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              language: 'Polish',
            })),
          });
          await tx.church.update({
            where: { id: duplicate.id },
            data: { lastScrapedAt: new Date() },
          });
        });
        stats.schedulesProcessed++;
        stats.massSchedulesCreated += schedules.length;
      } catch (error) {
        stats.errors++;
        console.error(`    Error saving schedules for ${churchId}: ${error instanceof Error ? error.message : error}`);
      }
    }
    return;
  }

  // Create new church
  try {
    const newChurch = await prisma.church.create({
      data: {
        name: parsed.name,
        latitude: parsed.latitude,
        longitude: parsed.longitude,
        address: parsed.address,
        zip: parsed.zip,
        city: parsed.city,
        country: 'PL',
        phone: parsed.phone,
        website: parsed.website,
        email: parsed.email,
        hasWebsite: !!parsed.website,
        mszeInfoId: churchId,
        source: 'msze-info',
      },
    });
    stats.churchesCreated++;

    // Add to in-memory array for within-run dedup
    existingChurches.push({
      id: newChurch.id,
      name: parsed.name,
      latitude: parsed.latitude,
      longitude: parsed.longitude,
      osmId: null,
      baiduId: null,
      masstimesId: null,
      orarimesseId: null,
      massSchedulesPhId: null,
      philmassId: null,
      horariosMisasId: null,
      mszeInfoId: churchId,
      weekdayMassesId: null,
      messesInfoId: null,
      bohosluzbyId: null,
      miserendId: null,
      kerknetId: null,
      gottesdienstzeitenId: null,
      source: 'msze-info',
      website: parsed.website,
      phone: parsed.phone,
      address: parsed.address,
    });

    // Create mass schedules. This has its own error handling so that a
    // schedule failure is reported as such and never makes the church that
    // was just created look "skipped". Rows and the lastScrapedAt stamp are
    // written atomically, as in the matched branch.
    if (schedules.length > 0) {
      try {
        await prisma.$transaction(async (tx) => {
          await tx.massSchedule.createMany({
            data: schedules.map((s) => ({
              churchId: newChurch.id,
              dayOfWeek: s.dayOfWeek,
              time: s.time,
              language: 'Polish',
            })),
          });
          await tx.church.update({
            where: { id: newChurch.id },
            data: { lastScrapedAt: new Date() },
          });
        });
        stats.schedulesProcessed++;
        stats.massSchedulesCreated += schedules.length;
      } catch (error) {
        stats.errors++;
        console.error(`    Error saving schedules for ${churchId}: ${error instanceof Error ? error.message : error}`);
      }
    }
  } catch (error) {
    if (isUniqueConstraintError(error)) {
      console.log(`  Skipping ${churchId}: unique constraint conflict while creating church`);
      stats.churchesSkipped++;
      return;
    }
    throw error;
  }
}

// ─── CLI ─────────────────────────────────────────────────────────────────────

/**
 * Parses the command-line options from `process.argv`.
 *
 * Exits the process with status 1 (after printing an error) when:
 *   - `--resume-from` has no value or its value is not a non-negative integer,
 *   - `--job-id` has no value,
 *   - `--all` is missing.
 * `--help` / `-h` prints usage and exits with status 0. Unrecognised options
 * are ignored with a warning, so a mistyped flag does not go unnoticed.
 *
 * @returns The parsed options; `resumeFrom` and `jobId` are present only when given.
 */
function parseArgs(): CLIArgs {
  const args = process.argv.slice(2);
  const result: CLIArgs = {
    all: false,
    dryRun: false,
  };

  // Prints a usage error and terminates with a non-zero exit code.
  const fail = (message: string): never => {
    console.error(`Error: ${message}`);
    return process.exit(1);
  };

  // Returns the value following a flag, or fails when it is missing or is
  // itself another "--option".
  const valueFor = (flag: string, value: string | undefined): string => {
    if (value === undefined || value.startsWith('--')) {
      return fail(`${flag} requires a value`);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    switch (arg) {
      case '--all':
        result.all = true;
        break;
      case '--dry-run':
        result.dryRun = true;
        break;
      case '--resume-from': {
        const raw = valueFor(arg, args[++i]);
        // Digits only: rejects negatives, decimals and trailing garbage, which
        // would otherwise become NaN or an unintended offset.
        if (!/^\d+$/.test(raw)) {
          fail(`--resume-from expects a non-negative integer, got "${raw}"`);
        }
        result.resumeFrom = Number.parseInt(raw, 10);
        break;
      }
      case '--job-id':
        result.jobId = valueFor(arg, args[++i]);
        break;
      case '--help':
      case '-h':
        console.log(`
Usage: npx tsx scripts/import-msze-info.ts [options]

Options:
  --all                  Import all churches from sitemaps
  --dry-run              No database writes, just report what would happen
  --resume-from <n>      Skip first N churches
  --job-id <uuid>        Background job tracking ID
  --help, -h             Show this help message

Examples:
  npx tsx scripts/import-msze-info.ts --all --dry-run
  npx tsx scripts/import-msze-info.ts --all
  npx tsx scripts/import-msze-info.ts --all --resume-from 500
`);
        process.exit(0);
        break;
      default:
        console.warn(`Warning: ignoring unknown option "${arg}"`);
        break;
    }
  }

  if (!result.all) {
    fail('specify --all');
  }

  return result;
}

// ─── Helpers ─────────────────────────────────────────────────────────────────

/**
 * Renders a millisecond duration as "Xh Ym Zs", "Ym Zs" or "Zs", dropping
 * leading units that are zero. Fractions of a second are discarded.
 */
function formatDuration(ms: number): string {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);
  const secondsPart = `${totalSeconds % 60}s`;

  if (totalHours > 0) {
    return [`${totalHours}h`, `${totalMinutes % 60}m`, secondsPart].join(' ');
  }
  if (totalMinutes > 0) {
    return [`${totalMinutes}m`, secondsPart].join(' ');
  }
  return `${totalSeconds}s`;
}

// ─── Main ────────────────────────────────────────────────────────────────────

/**
 * Runs the whole import: parses CLI options, optionally marks the background
 * job as running, loads existing Polish churches, collects church IDs from the
 * sitemaps, processes each church in order, prints a summary and finally
 * records the outcome on the background job.
 *
 * Progress lines show the absolute position in the full sorted ID list, so the
 * number printed for the last processed church can be passed straight to
 * `--resume-from` to continue with the next one. A missing, NaN, zero or
 * negative resume offset means "start from the beginning".
 *
 * Background-job updates are best-effort: a failure is logged as a warning and
 * never aborts the import.
 *
 * NOTE(review): if this function throws (for example while loading existing
 * churches), the job is left in status 'running'; no failure status is visible
 * in this file, so none is written here.
 */
async function main() {
  const args = parseArgs();
  const startTime = Date.now();

  // Normalise the offset once; NaN and negatives would otherwise be silently
  // ignored or make Array.slice count from the end of the list.
  const resumeOffset =
    args.resumeFrom !== undefined && args.resumeFrom > 0 ? Math.floor(args.resumeFrom) : 0;

  console.log('\n' + '='.repeat(70));
  console.log('MSZE.INFO (POLAND) IMPORTER');
  console.log('='.repeat(70));
  console.log(`Mode:            All churches from sitemaps`);
  console.log(`Dry run:         ${args.dryRun ? 'YES (no DB writes)' : 'NO'}`);
  if (resumeOffset > 0) console.log(`Resume from:     ${resumeOffset}`);
  console.log(`Time:            ${new Date().toISOString()}`);
  console.log('='.repeat(70) + '\n');

  // Update background job status if provided
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: { status: 'running', startedAt: new Date() },
      });
    } catch (error) {
      // Job might not exist yet — keep going, but leave a trace.
      console.warn(
        `Warning: could not mark job ${args.jobId} as running: ${error instanceof Error ? error.message : error}`,
      );
    }
  }

  const stats: ImportStats = {
    churchesFound: 0,
    churchesMatched: 0,
    churchesCreated: 0,
    churchesSkipped: 0,
    schedulesProcessed: 0,
    massSchedulesCreated: 0,
    errors: 0,
  };

  // Load existing Polish churches for dedup
  const existingChurches = await loadExistingPolishChurches();

  // Fetch church IDs from sitemaps
  console.log('Fetching church URLs from sitemaps...');
  const allChurchIds = await fetchChurchUrlsFromSitemaps();
  if (allChurchIds.length === 0) {
    // Nothing to import means every sitemap failed or was empty; make sure the
    // run is not reported as a clean success.
    stats.errors++;
    console.error('No church IDs found in any sitemap');
  }

  // Handle --resume-from
  const churchIds = allChurchIds.slice(resumeOffset);
  if (resumeOffset > 0) {
    console.log(
      `Resuming from index ${resumeOffset} (skipping ${allChurchIds.length - churchIds.length} churches)\n`,
    );
  } else {
    console.log(`Processing ${churchIds.length} churches\n`);
  }

  // Process each church
  for (const [i, id] of churchIds.entries()) {
    const elapsed = formatDuration(Date.now() - startTime);
    const position = resumeOffset + i + 1;
    console.log(`[${position}/${allChurchIds.length}] kosciol/${id} [${elapsed} elapsed]`);

    try {
      await processChurch(id, existingChurches, args.dryRun, stats);
    } catch (error) {
      stats.errors++;
      console.error(`  ERROR processing ${id}: ${error instanceof Error ? error.message : error}`);
    }
  }

  // Print summary
  const totalTime = Date.now() - startTime;
  console.log('\n' + '='.repeat(70));
  console.log(`IMPORT SUMMARY ${args.dryRun ? '(DRY RUN)' : ''}`);
  console.log('='.repeat(70));
  console.log(`Churches found:         ${stats.churchesFound}`);
  console.log(`  Matched (existing):   ${stats.churchesMatched}`);
  console.log(`  Created (new):        ${stats.churchesCreated}`);
  console.log(`  Skipped:              ${stats.churchesSkipped}`);
  console.log(`Schedules processed:    ${stats.schedulesProcessed}`);
  console.log(`Mass schedules created: ${stats.massSchedulesCreated}`);
  console.log(`Errors:                 ${stats.errors}`);
  console.log(`Total time:             ${formatDuration(totalTime)}`);
  console.log(`HTTP requests:          ${requestCount}`);
  console.log('='.repeat(70) + '\n');

  // Update background job
  if (args.jobId) {
    try {
      await prisma.backgroundJob.update({
        where: { id: args.jobId },
        data: {
          status: stats.errors > 0 ? 'completed_with_errors' : 'completed',
          completedAt: new Date(),
          result: JSON.stringify(stats),
        },
      });
    } catch (error) {
      // Best-effort: the import itself is done, so only report the failure.
      console.warn(
        `Warning: could not record result on job ${args.jobId}: ${error instanceof Error ? error.message : error}`,
      );
    }
  }
}

// Run the importer. On a fatal error the exit code is set rather than calling
// process.exit(): exiting synchronously inside .catch() would terminate the
// process before .finally() runs, skipping the database cleanup below.
main()
  .catch((error) => {
    console.error('Fatal error:', error);
    process.exitCode = 1;
  })
  .finally(async () => {
    // Always release the Prisma client and the pg pool, on success and failure;
    // the pool is closed even if disconnecting Prisma throws.
    try {
      await prisma.$disconnect();
    } finally {
      await pool.end();
    }
  });