/*
|--------------------------------------------------------------------------
| node ace make:command fix-dataset-cross-references
| DONE: create commands/fix_dataset_cross_references.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import type { CommandOptions } from '@adonisjs/core/types/ace';
import Dataset from '#models/dataset';
import DatasetReference from '#models/dataset_reference';
// import env from '#start/env';

/** One reference whose counterpart on the target dataset could not be found. */
interface MissingCrossReference {
    sourceDatasetId: number;
    targetDatasetId: number;
    sourcePublishId: number | null;
    targetPublishId: number | null;
    referenceType: string;
    relation: string;
    doi: string | null;
    reverseRelation: string;
}

/** Outcome of a `--fix` run. */
interface FixResult {
    fixedCount: number;
    errorCount: number;
}

/** Matches a Tethys DOI, e.g. https://doi.org/10.24341/tethys.107 -> 107 */
const TETHYS_DOI_PATTERN = /10\.24341\/tethys\.(\d+)/;

/** Matches a Tethys dataset URL, e.g. https://tethys.at/dataset/107 -> 107 */
const TETHYS_URL_PATTERN = /tethys\.at\/dataset\/(\d+)/;

/** Relation used for the reverse reference when the original relation has no known inverse. */
const DEFAULT_REVERSE_RELATION = 'HasVersion';

/**
 * Relation -> inverse relation.
 * A Map is used instead of a plain object so that lookups never resolve to
 * inherited properties such as `toString` or `constructor`.
 */
const REVERSE_RELATIONS: ReadonlyMap<string, string> = new Map([
    ['IsNewVersionOf', 'IsPreviousVersionOf'],
    ['IsPreviousVersionOf', 'IsNewVersionOf'],
    ['IsVersionOf', 'HasVersion'],
    ['HasVersion', 'IsVersionOf'],
    ['Compiles', 'IsCompiledBy'],
    ['IsCompiledBy', 'Compiles'],
    ['IsVariantFormOf', 'IsOriginalFormOf'],
    ['IsOriginalFormOf', 'IsVariantFormOf'],
    ['IsPartOf', 'HasPart'],
    ['HasPart', 'IsPartOf'],
    ['IsSupplementTo', 'IsSupplementedBy'],
    ['IsSupplementedBy', 'IsSupplementTo'],
    ['Continues', 'IsContinuedBy'],
    ['IsContinuedBy', 'Continues'],
]);

/**
 * Ace command that looks for references between published datasets which
 * have no matching reverse reference, and optionally (`--fix`) creates them.
 *
 * Usage:
 *   node ace detect:missing-cross-references [--fix] [--verbose]
 */
export default class DetectMissingCrossReferences extends BaseCommand {
    static commandName = 'detect:missing-cross-references';
    static description = 'Detect missing bidirectional cross-references between versioned datasets';

    public static needsApplication = true;

    @flags.boolean({ alias: 'f', description: 'Fix missing cross-references automatically' })
    public fix: boolean = false;

    @flags.boolean({ alias: 'v', description: 'Verbose output' })
    public verbose: boolean = false;

    public static options: CommandOptions = {
        startApp: true,
        staysAlive: false,
    };

    /**
     * Entry point: detects missing reverse references, reports them and, when
     * `--fix` is set, creates them. Sets a non-zero exit code when detection
     * throws or when at least one reference could not be fixed.
     */
    async run() {
        this.logger.info('🔍 Detecting missing cross-references...');

        try {
            const missingReferences = await this.findMissingCrossReferences();

            if (missingReferences.length === 0) {
                this.logger.success('All cross-references are properly linked!');
                return;
            }

            this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s):`);

            for (const missing of missingReferences) {
                this.logger.info(
                    `Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`,
                );

                if (this.verbose) {
                    this.logger.info(` - Reference type: ${missing.referenceType}`);
                    this.logger.info(` - Relation: ${missing.relation}`);
                    this.logger.info(` - DOI: ${missing.doi}`);
                }
            }

            if (this.fix) {
                const { fixedCount, errorCount } = await this.fixMissingReferences(missingReferences);

                // Only claim full success when every entry was actually created.
                if (errorCount === 0) {
                    this.logger.success('All missing cross-references have been fixed!');
                } else {
                    this.logger.warning(
                        `Fixed ${fixedCount} of ${missingReferences.length} missing cross-reference(s); ${errorCount} could not be fixed`,
                    );
                    this.exitCode = 1;
                }
            } else {
                this.printMissingReferencesList(missingReferences);
                this.logger.info('💡 Run with --fix flag to automatically create missing cross-references');
            }
        } catch (error) {
            // Embed the error text in the message itself so the details are
            // always printed, regardless of how the logger treats extra arguments.
            this.logger.error(`Error detecting missing cross-references: ${this.describeError(error)}`);
            // Signal failure through the command's exit code instead of calling
            // process.exit(), so the command can finish and shut down normally.
            this.exitCode = 1;
        }
    }

    /**
     * Collects every DOI/URL reference from a published dataset to another
     * published Tethys dataset for which no reverse reference exists.
     *
     * Entries are unique per (source, target, reverse relation): several
     * references that would all be repaired by the same reverse row are
     * reported once, so `--fix` does not insert duplicates.
     *
     * @returns the list of missing reverse references (possibly empty)
     */
    private async findMissingCrossReferences(): Promise<MissingCrossReference[]> {
        const missingReferences: MissingCrossReference[] = [];
        const seenKeys = new Set<string>();

        this.logger.info('📊 Querying dataset references...');

        // Find all references that point to Tethys datasets (DOI or URL containing tethys DOI)
        // Only from datasets that are published
        const tethysReferences = await DatasetReference.query()
            .whereIn('type', ['DOI', 'URL'])
            .where((query) => {
                query.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
            })
            .preload('dataset', (datasetQuery) => {
                datasetQuery.where('server_state', 'published');
            })
            .whereHas('dataset', (datasetQuery) => {
                datasetQuery.where('server_state', 'published');
            });

        this.logger.info(`🔗 Found ${tethysReferences.length} Tethys references from published datasets`);

        let processedCount = 0;

        for (const reference of tethysReferences) {
            processedCount++;

            if (this.verbose && processedCount % 10 === 0) {
                this.logger.info(`📈 Processed ${processedCount}/${tethysReferences.length} references...`);
            }

            // Extract dataset publish_id from DOI or URL
            const targetDatasetPublish = this.extractDatasetPublishIdFromReference(reference.value);

            if (!targetDatasetPublish) {
                if (this.verbose) {
                    this.logger.warning(`⚠️ Could not extract publish ID from: ${reference.value}`);
                }
                continue;
            }

            // Check if target dataset exists and is published
            const targetDataset = await Dataset.query()
                .where('publish_id', targetDatasetPublish)
                .where('server_state', 'published')
                .first();

            if (!targetDataset) {
                if (this.verbose) {
                    this.logger.warning(`⚠️ Target dataset with publish_id ${targetDatasetPublish} not found or not published`);
                }
                continue;
            }

            // Ensure we have a valid source dataset with proper preloading
            if (!reference.dataset) {
                this.logger.warning(`⚠️ Source dataset ${reference.document_id} not properly loaded, skipping...`);
                continue;
            }

            const reverseRelation = this.getReverseRelation(reference.relation);

            // Skip references that would be repaired by a reverse row we already recorded.
            const key = `${reference.document_id}:${targetDataset.id}:${reverseRelation}`;
            if (seenKeys.has(key)) {
                continue;
            }

            // Check if reverse reference exists
            const reverseReferenceExists = await this.checkReverseReferenceExists(
                targetDataset.id,
                reference.document_id,
                reference.relation,
            );

            if (!reverseReferenceExists) {
                seenKeys.add(key);
                missingReferences.push({
                    sourceDatasetId: reference.document_id,
                    targetDatasetId: targetDataset.id,
                    sourcePublishId: reference.dataset.publish_id ?? null,
                    targetPublishId: targetDataset.publish_id ?? null,
                    referenceType: reference.type,
                    relation: reference.relation,
                    doi: reference.value,
                    reverseRelation,
                });
            }
        }

        this.logger.info(`✅ Processed all ${processedCount} references`);
        return missingReferences;
    }

    /**
     * Extracts the numeric dataset publish id from a reference value.
     * The DOI form is tried first, then the dataset URL form.
     *
     * @param value reference value, e.g. `https://doi.org/10.24341/tethys.107`
     * @returns the publish id, or `null` when the value matches neither form
     */
    private extractDatasetPublishIdFromReference(value: string): number | null {
        const digits = value.match(TETHYS_DOI_PATTERN)?.[1] ?? value.match(TETHYS_URL_PATTERN)?.[1];
        return digits === undefined ? null : Number.parseInt(digits, 10);
    }

    /**
     * Checks whether a reverse reference already exists.
     *
     * Note the perspective: `sourceDatasetId` is the dataset that should OWN the
     * reverse reference (the target of the original reference) and
     * `targetDatasetId` is the dataset it should point back to.
     *
     * @param sourceDatasetId id of the dataset expected to hold the reverse reference
     * @param targetDatasetId id of the dataset the reverse reference points to
     * @param originalRelation relation of the original (forward) reference
     * @returns `true` when a matching reverse reference is stored on a published dataset
     */
    private async checkReverseReferenceExists(
        sourceDatasetId: number,
        targetDatasetId: number,
        originalRelation: string,
    ): Promise<boolean> {
        const reverseRelation = this.getReverseRelation(originalRelation);

        // Only check for reverse references where the source dataset is also published
        const reverseReference = await DatasetReference.query()
            .where('document_id', sourceDatasetId)
            .where('related_document_id', targetDatasetId)
            .where('relation', reverseRelation)
            .whereHas('dataset', (datasetQuery) => {
                datasetQuery.where('server_state', 'published');
            })
            .first();

        return reverseReference !== null;
    }

    /**
     * Maps a relation to its inverse.
     *
     * @param relation relation of the forward reference
     * @returns the inverse relation; relations without a known inverse
     *          (including differently-cased spellings such as 'compiles')
     *          fall back to 'HasVersion'
     */
    private getReverseRelation(relation: string): string {
        return REVERSE_RELATIONS.get(relation) ?? DEFAULT_REVERSE_RELATION;
    }

    /**
     * Prints a human-readable report of all missing reverse references to stdout.
     *
     * @param missingReferences entries produced by `findMissingCrossReferences`
     */
    private printMissingReferencesList(missingReferences: MissingCrossReference[]) {
        console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
        console.log('│ MISSING CROSS-REFERENCES REPORT │');
        console.log('│ (Published Datasets Only) │');
        console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
        console.log();

        missingReferences.forEach((missing, index) => {
            console.log(
                `${index + 1}. Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId}) → Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId})`,
            );
            console.log(` ├─ Current relation: "${missing.relation}"`);
            console.log(` ├─ Missing reverse relation: "${missing.reverseRelation}"`);
            console.log(` ├─ Reference type: ${missing.referenceType}`);
            console.log(` └─ DOI/URL: ${missing.doi}`);
            console.log();
        });

        console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
        console.log(`│ SUMMARY: ${missingReferences.length} missing reverse reference(s) detected │`);
        console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
    }

    /**
     * Creates the missing reverse references in the database.
     *
     * Each reverse reference is stored on the target dataset and points back to
     * the source dataset, using the source's DOI when it has one and its public
     * dataset URL otherwise. Failures are logged and counted; they do not abort
     * the remaining entries.
     *
     * @param missingReferences entries produced by `findMissingCrossReferences`
     * @returns how many references were created and how many could not be
     */
    private async fixMissingReferences(missingReferences: MissingCrossReference[]): Promise<FixResult> {
        this.logger.info('🔧 Creating missing cross-references in database...');

        let fixedCount = 0;
        let errorCount = 0;

        for (const [index, missing] of missingReferences.entries()) {
            try {
                // Get the source dataset to create proper reference - ensure it's published
                const sourceDataset = await Dataset.query()
                    .where('id', missing.sourceDatasetId)
                    .where('server_state', 'published')
                    .preload('identifier')
                    .first();

                if (!sourceDataset) {
                    this.logger.warning(`⚠️ Source dataset ${missing.sourceDatasetId} not found or not published, skipping...`);
                    errorCount++;
                    continue;
                }

                // Create the reverse reference
                const reverseReference = new DatasetReference();
                reverseReference.document_id = missing.targetDatasetId;
                reverseReference.related_document_id = missing.sourceDatasetId;
                reverseReference.relation = missing.reverseRelation;

                if (sourceDataset.identifier?.value) {
                    // Use the source dataset's DOI for the value
                    reverseReference.type = 'DOI';
                    reverseReference.value = `https://doi.org/${sourceDataset.identifier.value}`;
                } else if (sourceDataset.publish_id) {
                    // Fallback to dataset URL if no DOI; the type must say so,
                    // otherwise a plain URL would be stored as a DOI.
                    reverseReference.type = 'URL';
                    reverseReference.value = `https://tethys.at/dataset/${sourceDataset.publish_id}`;
                } else {
                    // Dataset URLs are resolved by publish_id elsewhere in this
                    // command, so building one from the internal id could link
                    // to a different dataset. Skip rather than store a wrong link.
                    this.logger.warning(
                        `⚠️ Source dataset ${missing.sourceDatasetId} has neither a DOI nor a publish ID, skipping...`,
                    );
                    errorCount++;
                    continue;
                }

                // Use the source dataset's main title for the label
                reverseReference.label = sourceDataset.mainTitle || `Dataset ${missing.sourceDatasetId}`;

                await reverseReference.save();
                fixedCount++;

                if (this.verbose) {
                    this.logger.info(
                        `✅ [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.targetDatasetId} -> ${missing.sourceDatasetId}`,
                    );
                } else if ((index + 1) % 10 === 0) {
                    this.logger.info(`📈 Fixed ${fixedCount}/${missingReferences.length} references...`);
                }
            } catch (error) {
                this.logger.error(
                    `❌ Error creating reverse reference for datasets ${missing.targetDatasetId} -> ${missing.sourceDatasetId}: ${this.describeError(error)}`,
                );
                errorCount++;
            }
        }

        this.logger.info(`📊 Fix completed: ${fixedCount} created, ${errorCount} errors`);
        return { fixedCount, errorCount };
    }

    /**
     * Turns a caught value into printable text.
     *
     * @param error value received in a `catch` clause
     * @returns the error's message for `Error` instances, otherwise its string form
     */
    private describeError(error: unknown): string {
        return error instanceof Error ? error.message : String(error);
    }
}