feat: add buscarmisas-network importer — sitemap discovery
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -184,3 +184,76 @@ export function parseMassSchedule(html: string, countryCode: string): ParsedMass
|
|||||||
}
|
}
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─── HTTP Helpers ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Value of the `User-Agent` header that `fetchText` attaches to each request. */
const USER_AGENT = 'NearestMass-Importer/1.0 (parish data aggregator; contact: privacy@nearestmass.com)';

// NOTE(review): neither delay below is referenced in the visible part of this
// file; the names suggest a pause between consecutive requests and a longer
// pause when switching domains — confirm against the callers.

/** Delay of two seconds, expressed in milliseconds. */
const REQUEST_DELAY_MS = 2 * 1_000;

/** Delay of five seconds, expressed in milliseconds. */
const DOMAIN_DELAY_MS = 5 * 1_000;
|
||||||
|
|
||||||
|
/**
 * Download `url` and return the response body as text.
 *
 * The request carries the importer's `User-Agent` header.
 *
 * @param url - Absolute URL to request.
 * @returns The response body decoded as text.
 * @throws Error with message `HTTP <status> for <url>` when the response
 *   status is not OK; anything `fetch` itself rejects with propagates as-is.
 */
async function fetchText(url: string): Promise<string> {
  const response = await fetch(url, {
    headers: { 'User-Agent': USER_AGENT },
  });

  if (response.ok) {
    return response.text();
  }

  throw new Error(`HTTP ${response.status} for ${url}`);
}
|
||||||
|
|
||||||
|
/**
 * Fetch `url` as text, retrying only when the server answers 429 or 503.
 *
 * Any other failure (other HTTP statuses, network errors) is rethrown
 * immediately. Between attempts the function waits `attempt * 30s`, so with
 * the default of three attempts the waits are 30s and then 60s.
 *
 * @param url - Absolute URL to request.
 * @param retries - Total number of attempts, including the first one. Values
 *   below 1 are treated as 1 so that the URL is always tried at least once.
 * @returns The response body as text.
 * @throws The error from the last attempt, or from the first non-retryable one.
 */
async function fetchWithRetry(url: string, retries = 3): Promise<string> {
  // Guarantee at least one real attempt; a zero or negative count used to fall
  // straight through to the trailing throw without ever touching the network.
  const maxAttempts = Math.max(1, Math.floor(retries));

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await fetchText(url);
    } catch (err: unknown) {
      const msg = err instanceof Error ? err.message : String(err);
      if (attempt === maxAttempts) throw err;

      // fetchText reports failures as "HTTP <status> for <url>". Anchor on that
      // prefix: a plain substring search would also hit URLs that merely
      // contain "429" or "503" and retry e.g. a 404 for minutes.
      const isRetryable = /^HTTP (429|503)\b/.test(msg);
      if (!isRetryable) throw err;

      const backoff = attempt * 30_000; // linear backoff: 30s, 60s, ...
      console.warn(` [retry ${attempt}/${maxAttempts}] ${msg} — waiting ${backoff / 1000}s`);
      await sleep(backoff);
    }
  }

  // Only reachable when `retries` is not a usable number (e.g. NaN).
  throw new Error('unreachable');
}
|
||||||
|
|
||||||
|
/**
 * Build a promise that settles once a `setTimeout` of `ms` milliseconds fires.
 *
 * @param ms - Timer duration in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves with no value; it never rejects.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||||
|
|
||||||
|
// ─── Sitemap Discovery ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Fetch all church page URLs for a domain from its sitemap.
 *
 * Reads `https://{domain}/sitemap_index.xml`, selects the child sitemaps that
 * match `config.sitemapType` (`page-sitemap*.xml` when it is `'page'`,
 * `post-sitemap*.xml` otherwise), then downloads each child one after another
 * and collects its `<loc>` entries.
 *
 * Church URLs have exactly 3 path segments: /{region}/{city}/{slug}/
 *
 * @param domain - Host name without scheme; interpolated into the index URL.
 * @param config - Site configuration; only `sitemapType` is read here.
 * @returns De-duplicated church URLs in the order they were first seen.
 * @throws Whatever `fetchWithRetry` throws for the index or any child sitemap.
 */
export async function getChurchUrls(domain: string, config: SiteConfig): Promise<string[]> {
  const indexUrl = `https://${domain}/sitemap_index.xml`;
  console.log(`Fetching sitemap index: ${indexUrl}`);
  const indexXml = await fetchWithRetry(indexUrl);

  // Extract child sitemap URLs matching the sitemapType. Both variants accept
  // an optional numeric suffix: large sites split their sitemap into numbered
  // files (…-sitemap.xml, …-sitemap2.xml, …), and skipping those would
  // silently drop churches.
  const childPattern = config.sitemapType === 'page'
    ? /https:\/\/[^<]*\/page-sitemap\d*\.xml/g
    : /https:\/\/[^<]*\/post-sitemap\d*\.xml/g;

  // De-duplicate so a sitemap listed twice in the index is fetched only once.
  const childUrls = [...new Set([...indexXml.matchAll(childPattern)].map(m => m[0]))];
  console.log(` Found ${childUrls.length} child sitemaps`);

  // A Set keeps first-seen order and drops repeats across child sitemaps.
  const churchUrls = new Set<string>();
  for (const sitemapUrl of childUrls) {
    const xml = await fetchWithRetry(sitemapUrl);
    for (const match of xml.matchAll(/<loc>([^<]+)<\/loc>/g)) {
      const loc = match[1].trim();
      if (isChurchUrl(loc)) {
        churchUrls.add(loc);
      }
    }
  }

  const unique = [...churchUrls];
  console.log(` Total church URLs: ${unique.length}`);
  return unique;
}

/** True when `loc` parses as a URL whose path has exactly 3 non-empty segments. */
function isChurchUrl(loc: string): boolean {
  try {
    return new URL(loc).pathname.split('/').filter(Boolean).length === 3;
  } catch {
    // Malformed URLs are skipped rather than aborting the whole discovery run.
    return false;
  }
}
|
||||||
|
|||||||
Reference in New Issue
Block a user