/*
|--------------------------------------------------------------------------
| node ace make:command fix-dataset-cross-references
| DONE: create commands/fix_dataset_cross_references.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import type { CommandOptions } from '@adonisjs/core/types/ace';
import { DateTime } from 'luxon';
import Dataset from '#models/dataset';
import DatasetReference from '#models/dataset_reference';
// import env from '#start/env';

/**
 * One forward reference (source -> target) for which no matching reverse
 * reference was found on the target dataset.
 */
interface MissingCrossReference {
    sourceDatasetId: number;
    targetDatasetId: number;
    sourcePublishId: number | null;
    targetPublishId: number | null;
    sourceDoi: string | null;
    targetDoi: string | null;
    referenceType: string;
    relation: string;
    doi: string | null;
    reverseRelation: string;
}

/**
 * Relation -> reverse relation. A Map is used instead of an object literal so
 * that lookups with arbitrary strings never hit inherited object members.
 */
const REVERSE_RELATIONS: ReadonlyMap<string, string> = new Map([
    ['IsNewVersionOf', 'IsPreviousVersionOf'],
    ['IsPreviousVersionOf', 'IsNewVersionOf'],
    ['IsVariantFormOf', 'IsOriginalFormOf'],
    ['IsOriginalFormOf', 'IsVariantFormOf'],
]);

/**
 * Ace command that looks for version/variant references between published
 * datasets that lack the corresponding reverse reference and, with `--fix`,
 * creates the missing reverse references.
 *
 * Examples:
 *   node ace detect:missing-cross-references
 *   node ace detect:missing-cross-references --verbose
 *   node ace detect:missing-cross-references --verbose -p 227   (filter by publish_id, with details)
 *   node ace detect:missing-cross-references --fix -p 227       (filter by publish_id and fix it)
 */
export default class DetectMissingCrossReferences extends BaseCommand {
    static commandName = 'detect:missing-cross-references';
    static description = 'Detect missing bidirectional cross-references between versioned datasets';

    public static needsApplication = true;

    @flags.boolean({ alias: 'f', description: 'Fix missing cross-references automatically' })
    public fix: boolean = false;

    @flags.boolean({ alias: 'v', description: 'Verbose output' })
    public verbose: boolean = false;

    @flags.number({ alias: 'p', description: 'Filter by specific publish_id (source or target dataset)' })
    public publish_id?: number;

    public static options: CommandOptions = {
        startApp: true,
        staysAlive: false,
    };

    // Define the allowed relations that we want to process
    private readonly ALLOWED_RELATIONS = ['IsNewVersionOf', 'IsPreviousVersionOf', 'IsVariantFormOf', 'IsOriginalFormOf'];

    /**
     * Command entry point: detects missing reverse references, reports them,
     * and optionally fixes them. On failure (or partial failure of `--fix`)
     * the command's exit code is set to 1 instead of killing the process.
     */
    async run() {
        // Same test the query uses, so the log lines and the applied filter always agree.
        const hasPublishFilter = typeof this.publish_id === 'number';

        this.logger.info('🔍 Detecting missing cross-references...');
        this.logger.info(`📋 Processing only these relations: ${this.ALLOWED_RELATIONS.join(', ')}`);

        if (hasPublishFilter) {
            this.logger.info(`Filtering by publish_id: ${this.publish_id}`);
        }

        try {
            const missingReferences = await this.findMissingCrossReferences();

            if (missingReferences.length === 0) {
                const filterMsg = hasPublishFilter ? ` for publish_id ${this.publish_id}` : '';
                this.logger.success(`All cross-references are properly linked for the specified relations${filterMsg}!`);
                return;
            }

            const filterMsg = hasPublishFilter ? ` (filtered by publish_id ${this.publish_id})` : '';
            this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s)${filterMsg}:`);

            if (!this.verbose) {
                // Show brief list if not verbose mode
                for (const missing of missingReferences) {
                    const sourceDoi = missing.sourceDoi ? ` DOI: ${missing.sourceDoi}` : '';
                    const targetDoi = missing.targetDoi ? ` DOI: ${missing.targetDoi}` : '';

                    this.logger.info(
                        `Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId}${sourceDoi}) ${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId}${targetDoi}) → missing reverse: ${missing.reverseRelation}`,
                    );
                }
            } else {
                // Verbose mode - show detailed info
                for (const missing of missingReferences) {
                    this.logger.info(
                        `Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`,
                    );
                    this.logger.info(` - Reference type: ${missing.referenceType}`);
                    this.logger.info(` - Relation: ${missing.relation}`);
                    this.logger.info(` - DOI: ${missing.doi}`);
                }
            }

            if (this.fix) {
                const { fixedCount, errorCount } = await this.fixMissingReferences(missingReferences);

                if (errorCount === 0) {
                    this.logger.success('All missing cross-references have been fixed!');
                } else {
                    // Do not claim success when some references could not be created.
                    this.logger.warning(
                        `Fixed ${fixedCount} of ${missingReferences.length} missing cross-reference(s); ${errorCount} could not be fixed`,
                    );
                    this.exitCode = 1;
                }
            } else {
                if (this.verbose) {
                    this.printMissingReferencesList(missingReferences);
                }
                this.logger.info('💡 Run with --fix flag to automatically create missing cross-references');
                if (hasPublishFilter) {
                    this.logger.info(`🎯 Currently filtering by publish_id: ${this.publish_id}`);
                }
            }
        } catch (error) {
            // The logger's second argument is an options object, so the error text
            // has to be part of the message itself or it is never printed.
            this.logger.error(`Error detecting missing cross-references: ${this.describeError(error)}`);
            this.exitCode = 1;
        }
    }

    /**
     * Turns an unknown thrown value into printable text.
     */
    private describeError(error: unknown): string {
        if (error instanceof Error) {
            return error.stack ?? error.message;
        }
        return String(error);
    }

    /**
     * Collects every reference from a published dataset to another published
     * Tethys dataset (matched by DOI or dataset URL) whose relation is one of
     * ALLOWED_RELATIONS and for which no reverse reference exists.
     *
     * @returns the list of forward references lacking a reverse reference
     */
    private async findMissingCrossReferences(): Promise<MissingCrossReference[]> {
        const missingReferences: MissingCrossReference[] = [];

        this.logger.info('📊 Querying dataset references...');

        // Find all references that point to Tethys datasets (DOI or URL containing tethys DOI)
        // Only from datasets that are published AND only for allowed relations
        const tethysReferencesQuery = DatasetReference.query()
            .whereIn('type', ['DOI', 'URL'])
            .whereIn('relation', this.ALLOWED_RELATIONS) // Only process allowed relations
            .where((query) => {
                query.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
            })
            .preload('dataset', (datasetQuery) => {
                datasetQuery.preload('identifier');
            })
            .whereHas('dataset', (datasetQuery) => {
                datasetQuery.where('server_state', 'published');
            });

        const publishId = this.publish_id;
        if (typeof publishId === 'number') {
            tethysReferencesQuery.whereHas('dataset', (datasetQuery) => {
                datasetQuery.where('publish_id', publishId);
            });
        }

        const tethysReferences = await tethysReferencesQuery.exec();

        this.logger.info(`🔗 Found ${tethysReferences.length} Tethys references from published datasets (allowed relations only)`);

        let processedCount = 0;
        let skippedCount = 0;

        for (const reference of tethysReferences) {
            processedCount++;

            if (this.verbose && processedCount % 10 === 0) {
                this.logger.info(`📈 Processed ${processedCount}/${tethysReferences.length} references...`);
            }

            // Double-check that this relation is in our allowed list (safety check)
            if (!this.ALLOWED_RELATIONS.includes(reference.relation)) {
                skippedCount++;
                if (this.verbose) {
                    this.logger.info(`⏭️ Skipping relation "${reference.relation}" - not in allowed list`);
                }
                continue;
            }

            // Extract dataset publish_id from DOI or URL
            const targetDatasetPublish = this.extractDatasetPublishIdFromReference(reference.value);

            if (!targetDatasetPublish) {
                if (this.verbose) {
                    this.logger.warning(`⚠️ Could not extract publish ID from: ${reference.value}`);
                }
                continue;
            }

            // Check if target dataset exists and is published
            const targetDataset = await Dataset.query()
                .where('publish_id', targetDatasetPublish)
                .where('server_state', 'published')
                .preload('identifier')
                .first();

            if (!targetDataset) {
                if (this.verbose) {
                    this.logger.warning(`⚠️ Target dataset with publish_id ${targetDatasetPublish} not found or not published`);
                }
                continue;
            }

            // Ensure we have a valid source dataset with proper preloading
            if (!reference.dataset) {
                this.logger.warning(`⚠️ Source dataset ${reference.document_id} not properly loaded, skipping...`);
                continue;
            }

            // Check if reverse reference exists
            const reverseReferenceExists = await this.checkReverseReferenceExists(targetDataset.id, reference.relation);
            if (reverseReferenceExists) {
                continue;
            }

            // Only add if we have a valid reverse relation
            const reverseRelation = this.getReverseRelation(reference.relation);
            if (!reverseRelation) {
                continue;
            }

            missingReferences.push({
                sourceDatasetId: reference.document_id,
                targetDatasetId: targetDataset.id,
                sourcePublishId: reference.dataset.publish_id ?? null,
                targetPublishId: targetDataset.publish_id ?? null,
                referenceType: reference.type,
                relation: reference.relation,
                doi: reference.value,
                reverseRelation: reverseRelation,
                sourceDoi: reference.dataset.identifier ? reference.dataset.identifier.value : null,
                targetDoi: targetDataset.identifier ? targetDataset.identifier.value : null,
            });
        }

        this.logger.info(`✅ Processed ${processedCount} references (${skippedCount} skipped due to relation filtering)`);

        return missingReferences;
    }

    /**
     * Extracts the numeric dataset publish id from a reference value.
     *
     * @param value a DOI (e.g. https://doi.org/10.24341/tethys.107) or a
     *              dataset URL (e.g. https://tethys.at/dataset/107)
     * @returns the trailing number (107 in both examples), or null when
     *          neither pattern matches
     */
    private extractDatasetPublishIdFromReference(value: string): number | null {
        // Extract from DOI: https://doi.org/10.24341/tethys.107 -> 107
        const doiMatch = value.match(/10\.24341\/tethys\.(\d+)/);
        if (doiMatch) {
            return Number.parseInt(doiMatch[1], 10);
        }

        // Extract from URL: https://tethys.at/dataset/107 -> 107
        const urlMatch = value.match(/tethys\.at\/dataset\/(\d+)/);
        if (urlMatch) {
            return Number.parseInt(urlMatch[1], 10);
        }

        return null;
    }

    /**
     * Checks whether the target dataset already has an incoming reference
     * carrying the reverse of `originalRelation`.
     *
     * @param targetDatasetId  id of the dataset that should carry the reverse reference
     * @param originalRelation relation of the forward reference
     * @returns true when such a reference exists, or when the relation has no
     *          defined reverse (so callers skip it)
     */
    private async checkReverseReferenceExists(targetDatasetId: number, originalRelation: string): Promise<boolean> {
        const reverseRelation = this.getReverseRelation(originalRelation);

        if (!reverseRelation) {
            return true; // If no reverse relation is defined, consider it as "exists" to skip processing
        }

        // Deliberately not filtered by the source document_id: any incoming reference
        // with the reverse relation counts. Note that this query does not check the
        // published state of the referencing dataset either.
        const reverseReference = await DatasetReference.query()
            .where('related_document_id', targetDatasetId)
            .where('relation', reverseRelation)
            .first();

        return !!reverseReference;
    }

    /**
     * Maps a relation to its reverse.
     *
     * @returns the reverse relation, or null when the relation is not one of
     *          the four supported version/variant relations
     */
    private getReverseRelation(relation: string): string | null {
        return REVERSE_RELATIONS.get(relation) ?? null;
    }

    /**
     * Prints a numbered, human-readable report of all missing reverse
     * references followed by a summary box.
     */
    private printMissingReferencesList(missingReferences: MissingCrossReference[]) {
        console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
        console.log('│ MISSING CROSS-REFERENCES REPORT │');
        console.log('│ (Published Datasets Only - Filtered Relations) │');
        console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
        console.log();

        missingReferences.forEach((missing, index) => {
            console.log(
                `${index + 1}. Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId} Identifier: ${missing.sourceDoi}) ${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId} Identifier: ${missing.targetDoi})`,
            );
            console.log(` ├─ Current relation: "${missing.relation}"`);
            console.log(` ├─ Missing reverse relation: "${missing.reverseRelation}"`);
            console.log(` ├─ Reference type: ${missing.referenceType}`);
            console.log(` └─ DOI/URL: ${missing.doi}`);
            console.log();
        });

        console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
        console.log(`│ SUMMARY: ${missingReferences.length} missing reverse reference(s) detected │`);
        console.log(`│ Processed relations: ${this.ALLOWED_RELATIONS.join(', ')} │`);
        console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
    }

    /**
     * Creates the reverse reference for every entry in `missingReferences`.
     * Entries whose source or target dataset is no longer found in state
     * "published" are counted as errors; a failure on one entry does not stop
     * the remaining ones.
     *
     * @returns how many reverse references were created and how many entries failed
     */
    private async fixMissingReferences(missingReferences: MissingCrossReference[]): Promise<{ fixedCount: number; errorCount: number }> {
        this.logger.info('🔧 Creating missing cross-references in database...');

        let fixedCount = 0;
        let errorCount = 0;
        let duplicateCount = 0;

        // A source may reference the same target twice with the same relation
        // (e.g. once by DOI and once by URL). Both yield the identical reverse
        // reference, so only the first one is created.
        const handledPairs = new Set<string>();

        for (const [index, missing] of missingReferences.entries()) {
            const pairKey = `${missing.sourceDatasetId}|${missing.targetDatasetId}|${missing.reverseRelation}`;
            if (handledPairs.has(pairKey)) {
                duplicateCount++;
                if (this.verbose) {
                    this.logger.info(
                        `⏭️ [${index + 1}/${missingReferences.length}] Reverse reference for Dataset ${missing.sourceDatasetId} -> ${missing.targetDatasetId} (${missing.reverseRelation}) already created in this run, skipping duplicate`,
                    );
                }
                continue;
            }

            try {
                // Get both source and target datasets
                const sourceDataset = await Dataset.query()
                    .where('id', missing.sourceDatasetId)
                    .where('server_state', 'published')
                    .preload('identifier')
                    .first();

                const targetDataset = await Dataset.query().where('id', missing.targetDatasetId).where('server_state', 'published').first();

                if (!sourceDataset) {
                    this.logger.warning(`⚠️ Source dataset ${missing.sourceDatasetId} not found or not published, skipping...`);
                    errorCount++;
                    continue;
                }

                if (!targetDataset) {
                    this.logger.warning(`⚠️ Target dataset ${missing.targetDatasetId} not found or not published, skipping...`);
                    errorCount++;
                    continue;
                }

                // Create the reverse reference using the referenced_by relationship
                // Example: If Dataset 297 IsNewVersionOf Dataset 144
                // We create an incoming reference for Dataset 144 that shows Dataset 297 IsPreviousVersionOf it
                const reverseReference = new DatasetReference();
                // Don't set document_id - this creates an incoming reference via related_document_id
                reverseReference.related_document_id = missing.targetDatasetId; // 144 (dataset receiving the incoming reference)
                reverseReference.type = 'DOI';
                reverseReference.relation = missing.reverseRelation;

                // Use the source dataset's DOI for the value (what's being referenced)
                if (sourceDataset.identifier?.value) {
                    reverseReference.value = `https://doi.org/${sourceDataset.identifier.value}`;
                } else {
                    // Fallback to dataset URL if no DOI
                    reverseReference.value = `https://tethys.at/dataset/${sourceDataset.publish_id ?? missing.sourceDatasetId}`;
                }

                // Use the source dataset's main title for the label
                reverseReference.label = sourceDataset.mainTitle || `Dataset ${missing.sourceDatasetId}`;

                // Persist the reference first: if this insert fails, the target dataset
                // must not be marked as modified.
                await reverseReference.save();

                // Also save 'server_date_modified' on target dataset to trigger any downstream updates (e.g. search index)
                targetDataset.server_date_modified = DateTime.now();
                await targetDataset.save();

                handledPairs.add(pairKey);
                fixedCount++;

                if (this.verbose) {
                    this.logger.info(
                        `✅ [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.sourceDatasetId} -> ${missing.targetDatasetId} (${missing.reverseRelation})`,
                    );
                } else if ((index + 1) % 10 === 0) {
                    this.logger.info(`📈 Fixed ${fixedCount}/${missingReferences.length} references...`);
                }
            } catch (error) {
                this.logger.error(
                    `❌ Error creating reverse reference for datasets ${missing.targetDatasetId} -> ${missing.sourceDatasetId}: ${this.describeError(error)}`,
                );
                errorCount++;
            }
        }

        this.logger.info(`📊 Fix completed: ${fixedCount} created, ${errorCount} errors`);
        if (duplicateCount > 0) {
            this.logger.info(`⏭️ Skipped ${duplicateCount} duplicate entr${duplicateCount === 1 ? 'y' : 'ies'} (same source, target and relation)`);
        }

        return { fixedCount, errorCount };
    }
}