fix: clean up church-matcher types and add HK OSM bounding box
- Remove discovermassId/buscarmisasNetworkId from findDuplicateChurch match passes (importers now do their own pre-check dedup); restore as optional fields on ExistingChurch to keep type/runtime in sync
- Add HK bounding box to COUNTRY_BOUNDING_BOXES; fix silent 0-result fallback when country query returns empty from mirror server
- discovermass importer: add --limit flag and skip-already-imported pre-check using importedSlugs set
- Import scripts: remove discovermassId from ExistingChurch select/stubs (field not needed in shared matcher context)
- Schema: reorder discovermassId/kerknetId/gottesdienstzeitenId fields

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -94,6 +94,7 @@ interface CLIArgs {
|
||||
all: boolean;
|
||||
dryRun: boolean;
|
||||
resumeFrom?: number;
|
||||
limit?: number;
|
||||
jobId?: string;
|
||||
}
|
||||
|
||||
@@ -507,6 +508,7 @@ function parseCLIArgs(): CLIArgs {
|
||||
case '--all': result.all = true; break;
|
||||
case '--dry-run': result.dryRun = true; break;
|
||||
case '--resume-from': result.resumeFrom = parseInt(args[++i], 10); break;
|
||||
case '--limit': result.limit = parseInt(args[++i], 10); break;
|
||||
case '--job-id': result.jobId = args[++i]; break;
|
||||
}
|
||||
}
|
||||
@@ -540,14 +542,25 @@ async function main() {
|
||||
try {
|
||||
const urls = await getAllChurchUrls();
|
||||
const existingChurches = await loadExistingChurches();
|
||||
|
||||
// Skip already-imported churches — check discovermassId set in DB
|
||||
const importedSlugs = new Set(
|
||||
existingChurches.filter(c => c.discovermassId).map(c => c.discovermassId!)
|
||||
);
|
||||
|
||||
// Apply --resume-from first, then filter to unimported, then apply --limit
|
||||
const startIdx = args.resumeFrom ?? 0;
|
||||
const churchUrls = urls.slice(startIdx);
|
||||
console.log(`\nProcessing ${churchUrls.length} churches (starting from index ${startIdx})...\n`);
|
||||
const candidateUrls = urls.slice(startIdx).filter(url => {
|
||||
const slug = url.replace('https://discovermass.com/church/', '').replace(/\/$/, '');
|
||||
return !importedSlugs.has(slug);
|
||||
});
|
||||
const churchUrls = args.limit ? candidateUrls.slice(0, args.limit) : candidateUrls;
|
||||
|
||||
console.log(`\nSitemap total: ${urls.length} | Already imported: ${importedSlugs.size} | This run: ${churchUrls.length}${args.limit ? ` (limit ${args.limit})` : ''}\n`);
|
||||
|
||||
for (let i = 0; i < churchUrls.length; i++) {
|
||||
const url = churchUrls[i];
|
||||
const overallIdx = startIdx + i;
|
||||
console.log(`[${overallIdx + 1}/${urls.length}] ${url}`);
|
||||
console.log(`[${i + 1}/${churchUrls.length}] ${url}`);
|
||||
await processChurch(url, existingChurches, args, stats);
|
||||
if (i < churchUrls.length - 1) {
|
||||
await sleep(REQUEST_DELAY_MS);
|
||||
|
||||
Reference in New Issue
Block a user