/*
 * Provenance — reconstructed from a git format-patch:
 *   Commit:  5c7bc4cfed9896e7a9dba432c7b7b8d034049a8f
 *   Author:  albertfj114 (Co-Authored-By: Claude Sonnet 4.6)
 *   Date:    Thu, 19 Mar 2026 23:43:19 -0400
 *   Subject: feat: add buscarmisas-network importer — sitemap discovery
 *   Target:  scripts/import-buscarmisas-network.ts (index e9a2f59..8b4f5d9),
 *            hunk @@ -184,3 +184,76 @@ — the code below is appended directly
 *            after the end of parseMassSchedule().
 *
 * The patch text had lost its line breaks and every angle-bracket span
 * (generic type arguments, the `<loc>` tag in the sitemap regex). Both are
 * restored here.
 */

// ─── HTTP Helpers ─────────────────────────────────────────────────────────────

/** User-Agent header sent with every request made by `fetchText`. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

// NOTE(review): the two delays below are not referenced in this section;
// presumably they throttle the crawl loop elsewhere in the file — confirm.
const REQUEST_DELAY_MS = 2_000;
const DOMAIN_DELAY_MS = 5_000;

/** HTTP statuses that `fetchWithRetry` treats as transient. */
const RETRYABLE_STATUSES: ReadonlySet<number> = new Set([429, 503]);

/**
 * Error thrown by `fetchText` for a non-2xx response.
 *
 * Carries the numeric status so retry logic can test the status itself
 * instead of searching the message text, which also contains the URL.
 */
class HttpError extends Error {
  constructor(
    public readonly status: number,
    public readonly url: string,
  ) {
    super(`HTTP ${status} for ${url}`);
    this.name = 'HttpError';
  }
}

/**
 * GET `url` and return the response body as text.
 *
 * @throws {HttpError} when the response status is not 2xx.
 */
async function fetchText(url: string): Promise<string> {
  const res = await fetch(url, { headers: { 'User-Agent': USER_AGENT } });
  if (!res.ok) throw new HttpError(res.status, url);
  return res.text();
}

/**
 * `fetchText` with linear backoff on HTTP 429 / 503.
 *
 * @param url     Address to fetch.
 * @param retries Maximum number of attempts (not additional attempts).
 * @returns The response body of the first successful attempt.
 * @throws The underlying error immediately when it is not a 429/503
 *         `HttpError`, or after the last attempt when it is.
 */
async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      return await fetchText(url);
    } catch (err) {
      // Decide on the status code, never on the message: the message embeds
      // the URL, so a 404 for ".../page-429/" must not look retryable.
      const isRetryable = err instanceof HttpError && RETRYABLE_STATUSES.has(err.status);
      if (attempt === retries || !isRetryable) throw err;

      const backoff = attempt * 30_000; // 30s, 60s, 90s
      console.warn(` [retry ${attempt}/${retries}] ${err.message} — waiting ${backoff / 1000}s`);
      await sleep(backoff);
    }
  }
  // Only reached when the loop never ran, i.e. retries < 1.
  throw new Error(`fetchWithRetry: no attempt made for ${url} (retries=${retries})`);
}

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

// ─── Sitemap Discovery ────────────────────────────────────────────────────────

/** The five predefined XML entities, which sitemap `<loc>` values must use. */
const XML_ENTITIES: Readonly<Record<string, string>> = {
  amp: '&',
  apos: "'",
  quot: '"',
  gt: '>',
  lt: '<',
};

/** Decode predefined XML entities in one pass (so `&amp;lt;` becomes `&lt;`, not `<`). */
function decodeXmlEntities(text: string): string {
  return text.replace(/&(amp|apos|quot|gt|lt);/g, (entity, name: string) => XML_ENTITIES[name] ?? entity);
}

/**
 * Fetch all church page URLs for a domain from its sitemap.
 * Church URLs have exactly 3 path segments: /{region}/{city}/{slug}/
 *
 * Reads `https://{domain}/sitemap_index.xml`, follows every child sitemap of
 * the kind selected by `config.sitemapType` (`'page'` selects
 * `page-sitemapN.xml`; any other value selects `post-sitemapN.xml`), and
 * collects the `<loc>` entries whose path has exactly three segments.
 *
 * @param domain Host name without scheme.
 * @param config Site configuration; only `sitemapType` is read here.
 * @returns De-duplicated church URLs in discovery order.
 * @throws Whatever `fetchWithRetry` throws for the index or any child sitemap.
 */
export async function getChurchUrls(domain: string, config: SiteConfig): Promise<string[]> {
  const indexUrl = `https://${domain}/sitemap_index.xml`;
  console.log(`Fetching sitemap index: ${indexUrl}`);
  const indexXml = await fetchWithRetry(indexUrl);

  // Extract child sitemap URLs matching the sitemapType. Both kinds may be
  // split into numbered files (…-sitemap.xml, …-sitemap2.xml, …).
  const childPattern = config.sitemapType === 'page'
    ? /https:\/\/[^<]*\/page-sitemap\d*\.xml/g
    : /https:\/\/[^<]*\/post-sitemap\d*\.xml/g;

  const childUrls = [...new Set([...indexXml.matchAll(childPattern)].map(m => m[0]))];
  console.log(` Found ${childUrls.length} child sitemaps`);

  // A Set de-duplicates while preserving first-seen order.
  const churchUrls = new Set<string>();
  for (const sitemapUrl of childUrls) {
    const xml = await fetchWithRetry(sitemapUrl);
    for (const match of xml.matchAll(/<loc>([^<]+)<\/loc>/g)) {
      const loc = decodeXmlEntities((match[1] ?? '').trim());
      // Church URLs: exactly 3 non-empty path segments after the domain
      try {
        const segments = new URL(loc).pathname.split('/').filter(Boolean);
        if (segments.length === 3) {
          churchUrls.add(loc);
        }
      } catch { /* skip malformed URLs */ }
    }
  }

  const unique = [...churchUrls];
  console.log(` Total church URLs: ${unique.length}`);
  return unique;
}