tethys.backend/commands/fix_dataset_cross_references.ts
Arno Kaimbacher b5bbe26ec2
Some checks failed
build.yaml / feat: Enhance background job settings UI and functionality (push) Failing after 0s
feat: Enhance background job settings UI and functionality
- Updated BackgroundJob.vue to improve the display of background job statuses, including missing cross-references and current job mode.
- Added auto-refresh functionality for background job status.
- Introduced success toast notifications for successful status refreshes.
- Modified the XML serialization process in DatasetXmlSerializer for better caching and performance.
- Implemented a new RuleProvider for managing custom validation rules.
- Improved error handling in routes for loading background job settings.
- Enhanced ClamScan configuration with socket support for virus scanning.
- Refactored dayjs utility to streamline locale management.
2025-10-14 12:19:09 +02:00

482 lines
22 KiB
TypeScript

/*
|--------------------------------------------------------------------------
| node ace make:command fix-dataset-cross-references
| DONE: create commands/fix_dataset_cross_references.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import type { CommandOptions } from '@adonisjs/core/types/ace';
import { DateTime } from 'luxon';
import Dataset from '#models/dataset';
import DatasetReference from '#models/dataset_reference';
import AppConfig from '#models/appconfig';
// import env from '#start/env';
/**
 * One forward reference between two published datasets for which no matching
 * reverse reference was found on the target dataset.
 */
interface MissingCrossReference {
    // --- source side: the dataset that owns the existing forward reference ---
    /** Internal id (`document_id`) of the dataset holding the forward reference. */
    sourceDatasetId: number;
    /** `publish_id` of the source dataset, or null when it has none. */
    sourcePublishId: number | null;
    /** Identifier value of the source dataset, or null when it has no identifier. */
    sourceDoi: string | null;
    /** Label of the forward reference; reused as the label of the created reverse reference. */
    sourceReferenceLabel: string | null;

    // --- target side: the dataset the forward reference points to ---
    /** Internal id of the referenced dataset. */
    targetDatasetId: number;
    /** `publish_id` of the target dataset, or null when it has none. */
    targetPublishId: number | null;
    /** Identifier value of the target dataset, or null when it has no identifier. */
    targetDoi: string | null;

    // --- the forward reference itself ---
    /** `type` column of the forward reference (the command only selects 'DOI' and 'URL'). */
    referenceType: string;
    /** Relation stored on the forward reference, e.g. 'IsNewVersionOf'. */
    relation: string;
    /** Raw `value` column of the forward reference (full DOI URL or dataset URL). */
    doi: string | null;
    /** Inverse of `relation`; the relation the missing reverse reference must carry. */
    reverseRelation: string;
}
/**
 * Ace command `detect:missing-cross-references`.
 *
 * Scans the references of published datasets that point to other Tethys
 * datasets and reports every forward reference (e.g. "A IsNewVersionOf B")
 * whose inverse ("B IsPreviousVersionOf A") is not stored on the target
 * dataset. With `--fix` the missing reverse references are created.
 *
 * The number of missing references found by an unfiltered run is persisted in
 * AppConfig (`appid = 'commands'`, `configkey = 'missing_cross_references_count'`).
 */
export default class DetectMissingCrossReferences extends BaseCommand {
    static commandName = 'detect:missing-cross-references';
    static description = 'Detect missing bidirectional cross-references between versioned datasets';
    public static needsApplication = true;

    /**
     * Single source of truth for the relations this command handles:
     * forward relation -> inverse relation. A Map is used instead of a plain
     * object so that lookups can never hit inherited properties.
     */
    private static readonly REVERSE_RELATIONS: ReadonlyMap<string, string> = new Map([
        ['IsNewVersionOf', 'IsPreviousVersionOf'],
        ['IsPreviousVersionOf', 'IsNewVersionOf'],
        ['IsVariantFormOf', 'IsOriginalFormOf'],
        ['IsOriginalFormOf', 'IsVariantFormOf'],
        ['Continues', 'IsContinuedBy'],
        ['IsContinuedBy', 'Continues'],
        ['HasPart', 'IsPartOf'],
        ['IsPartOf', 'HasPart'],
    ]);

    @flags.boolean({ alias: 'f', description: 'Fix missing cross-references automatically' })
    public fix: boolean = false;

    @flags.boolean({ alias: 'v', description: 'Verbose output' })
    public verbose: boolean = false;

    // The filter is applied to the dataset that owns the forward reference (see findMissingCrossReferences).
    @flags.number({ alias: 'p', description: 'Filter by specific publish_id of the source dataset' })
    public publish_id?: number;

    // example: node ace detect:missing-cross-references --verbose -p 227   // filter by publish_id, with details
    // example: node ace detect:missing-cross-references --verbose
    // example: node ace detect:missing-cross-references --fix -p 227       // filter by publish_id and fix it
    // example: node ace detect:missing-cross-references

    public static options: CommandOptions = {
        startApp: true,
        staysAlive: false,
    };

    // Relations that are processed; derived from the relation map so both can never drift apart.
    private readonly ALLOWED_RELATIONS: string[] = Array.from(DetectMissingCrossReferences.REVERSE_RELATIONS.keys());

    /**
     * Command entry point: detect, report and (with `--fix`) repair missing
     * reverse references. On an unexpected error the command logs it and
     * finishes with exit code 1.
     */
    async run() {
        const publishFilterActive = typeof this.publish_id === 'number';

        this.logger.info('🔍 Detecting missing cross-references...');
        this.logger.info(`📋 Processing only these relations: ${this.ALLOWED_RELATIONS.join(', ')}`);
        if (publishFilterActive) {
            this.logger.info(`Filtering by publish_id: ${this.publish_id}`);
        }

        try {
            const missingReferences = await this.findMissingCrossReferences();

            if (missingReferences.length === 0) {
                const filterMsg = publishFilterActive ? ` for publish_id ${this.publish_id}` : '';
                this.logger.success(`All cross-references are properly linked for the specified relations${filterMsg}!`);
                // Nothing is missing, so the stored count can be cleared.
                await this.recordMissingCount(0);
                return;
            }

            // In detect-only mode the count is known right away; in fix mode it is
            // stored after fixing, once we know how many references are left over.
            if (!this.fix) {
                await this.recordMissingCount(missingReferences.length);
            }

            const filterMsg = publishFilterActive ? ` (filtered by publish_id ${this.publish_id})` : '';
            this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s)${filterMsg}:`);

            if (!this.verbose) {
                // Brief one-line-per-reference list
                for (const missing of missingReferences) {
                    const sourceDoi = missing.sourceDoi ? ` DOI: ${missing.sourceDoi}` : '';
                    const targetDoi = missing.targetDoi ? ` DOI: ${missing.targetDoi}` : '';
                    this.logger.info(
                        `Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId}${sourceDoi}) ${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId}${targetDoi}) → missing reverse: ${missing.reverseRelation}`,
                    );
                }
            } else {
                // Verbose mode - show detailed info
                for (const missing of missingReferences) {
                    this.logger.info(
                        `Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`,
                    );
                    this.logger.info(` - Reference type: ${missing.referenceType}`);
                    this.logger.info(` - Relation: ${missing.relation}`);
                    this.logger.info(` - DOI: ${missing.doi}`);
                }
            }

            if (this.fix) {
                const { fixedCount, errorCount } = await this.fixMissingReferences(missingReferences);
                // Whatever could not be fixed is still missing - do not report 0 in that case.
                await this.recordMissingCount(errorCount);
                if (errorCount === 0) {
                    this.logger.success('All missing cross-references have been fixed!');
                } else {
                    this.logger.warning(
                        `Fixed ${fixedCount} of ${missingReferences.length} missing cross-reference(s); ${errorCount} could not be fixed`,
                    );
                }
            } else {
                if (this.verbose) {
                    this.printMissingReferencesList(missingReferences);
                }
                this.logger.info('💡 Run with --fix flag to automatically create missing cross-references');
                if (publishFilterActive) {
                    this.logger.info(`🎯 Currently filtering by publish_id: ${this.publish_id}`);
                }
            }
        } catch (error) {
            // The error is embedded in the message so its details are actually printed.
            this.logger.error(`Error detecting missing cross-references: ${this.formatError(error)}`);
            // Let Ace terminate the process with a failure code instead of killing it mid-flight.
            this.exitCode = 1;
        }
    }

    /**
     * Persists `count` as the number of missing cross-references, unless the
     * run was restricted with `--publish_id`: a filtered run only sees a
     * subset and must not overwrite the stored overall count.
     */
    private async recordMissingCount(count: number): Promise<void> {
        if (typeof this.publish_id === 'number') {
            if (this.verbose) {
                this.logger.info('ℹ️ Stored missing cross-references count left unchanged (run was filtered by publish_id)');
            }
            return;
        }
        await this.storeMissingCrossReferencesCount(count);
    }

    /**
     * Writes the count to AppConfig. Best effort: a failure is logged but
     * does not abort the command.
     */
    private async storeMissingCrossReferencesCount(count: number): Promise<void> {
        try {
            await AppConfig.updateOrCreate(
                {
                    appid: 'commands',
                    configkey: 'missing_cross_references_count',
                },
                {
                    configvalue: count.toString(),
                },
            );
            this.logger.info(`📊 Stored missing cross-references count in database: ${count}`);
        } catch (error) {
            this.logger.error(`Failed to store missing cross-references count: ${this.formatError(error)}`);
        }
    }

    /** Renders a caught value for log output (stack or message for Error instances). */
    private formatError(error: unknown): string {
        if (error instanceof Error) {
            return error.stack ?? error.message;
        }
        return String(error);
    }

    /**
     * Collects all forward references of published datasets (restricted to the
     * allowed relations and, if set, to the `publish_id` filter) whose target
     * is a published dataset that has no matching reverse reference.
     *
     * @returns one entry per distinct (source, target, relation) combination
     */
    private async findMissingCrossReferences(): Promise<MissingCrossReference[]> {
        const missingReferences: MissingCrossReference[] = [];
        // Guards against reporting (and later creating) the same reverse reference twice.
        const seenCombinations = new Set<string>();

        this.logger.info('📊 Querying dataset references...');

        // References that point to Tethys datasets (DOI, or URL containing a Tethys DOI / dataset URL),
        // only from published datasets and only for the allowed relations.
        const tethysReferencesQuery = DatasetReference.query()
            .whereIn('type', ['DOI', 'URL'])
            .whereIn('relation', this.ALLOWED_RELATIONS)
            .where((query) => {
                query.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
            })
            .preload('dataset', (datasetQuery) => {
                datasetQuery.preload('identifier');
            })
            .whereHas('dataset', (datasetQuery) => {
                datasetQuery.where('server_state', 'published');
            });

        const publishIdFilter = this.publish_id;
        if (typeof publishIdFilter === 'number') {
            tethysReferencesQuery.whereHas('dataset', (datasetQuery) => {
                datasetQuery.where('publish_id', publishIdFilter);
            });
        }

        const tethysReferences = await tethysReferencesQuery.exec();
        this.logger.info(`🔗 Found ${tethysReferences.length} Tethys references from published datasets (allowed relations only)`);

        let processedCount = 0;
        let skippedCount = 0;

        for (const reference of tethysReferences) {
            processedCount++;

            // Safety net in addition to the SQL filter: anything without a known inverse is skipped.
            const reverseRelation = this.getReverseRelation(reference.relation);
            if (!reverseRelation) {
                skippedCount++;
                if (this.verbose) {
                    this.logger.info(`⏭️ Skipping relation "${reference.relation}" - not in allowed list`);
                }
                continue;
            }

            // Without the preloaded source dataset no entry can be built; check before querying the target.
            if (!reference.dataset) {
                this.logger.warning(`⚠️ Source dataset ${reference.document_id} not properly loaded, skipping...`);
                continue;
            }

            // NOTE(review): values of the form "tethys.at/dataset/<n>" are selected by the query above
            // but carry no DOI, so they end up here and are skipped. Resolving them via publish_id
            // (see extractDatasetPublishIdFromReference) would be a behaviour change - confirm intent.
            const doi = this.extractDoiFromReference(reference.value);
            if (!doi) {
                if (this.verbose) {
                    this.logger.warning(`Could not extract DOI from: ${reference.value}`);
                }
                continue;
            }

            // The target must exist and be published; it is looked up via its identifier (DOI).
            const targetDataset = await Dataset.query()
                .where('server_state', 'published')
                .whereHas('identifier', (query) => {
                    query.where('value', doi);
                })
                .preload('identifier')
                .first();
            if (!targetDataset) {
                if (this.verbose) {
                    this.logger.warning(`⚠️ Target dataset with DOI ${doi} not found or not published`);
                }
                continue;
            }

            // The source dataset may have no identifier; never dereference it unguarded.
            const sourceIdentifier = reference.dataset.identifier?.value ?? null;

            const reverseReferenceExists = await this.checkReverseReferenceExists(
                targetDataset.id,
                reference.document_id,
                reference.relation,
                sourceIdentifier,
            );
            if (reverseReferenceExists) {
                continue;
            }

            const combinationKey = `${reference.document_id}:${targetDataset.id}:${reference.relation}`;
            if (seenCombinations.has(combinationKey)) {
                continue;
            }
            seenCombinations.add(combinationKey);

            missingReferences.push({
                sourceDatasetId: reference.document_id,
                targetDatasetId: targetDataset.id,
                sourcePublishId: reference.dataset.publish_id ?? null,
                targetPublishId: targetDataset.publish_id ?? null,
                referenceType: reference.type,
                relation: reference.relation,
                doi: reference.value,
                reverseRelation,
                sourceDoi: sourceIdentifier,
                targetDoi: targetDataset.identifier?.value ?? null,
                sourceReferenceLabel: reference.label || null,
            });
        }

        this.logger.info(`✅ Processed ${processedCount} references (${skippedCount} skipped due to relation filtering)`);
        return missingReferences;
    }

    /**
     * Extracts the bare DOI (e.g. "10.24341/tethys.99.2") from a reference
     * value, whether it is a bare DOI or embedded in a resolver URL such as
     * "https://doi.org/..." or "https://dx.doi.org/...".
     *
     * @returns the DOI, or null when the value contains none (e.g. a plain dataset URL)
     */
    private extractDoiFromReference(reference: string): string | null {
        const match = reference.trim().match(/\b10\.\d{4,9}\/\S+/);
        return match ? match[0] : null;
    }

    /**
     * Extracts the numeric id from a Tethys DOI ("10.24341/tethys.107" -> 107)
     * or dataset URL ("https://tethys.at/dataset/107" -> 107).
     * Currently not called anywhere in this class.
     *
     * @returns the number, or null when neither pattern matches
     */
    private extractDatasetPublishIdFromReference(value: string): number | null {
        const doiMatch = value.match(/10\.24341\/tethys\.(\d+)/);
        if (doiMatch) {
            return parseInt(doiMatch[1], 10);
        }
        const urlMatch = value.match(/tethys\.at\/dataset\/(\d+)/);
        if (urlMatch) {
            return parseInt(urlMatch[1], 10);
        }
        return null;
    }

    /**
     * Checks whether the target dataset already holds the inverse of the given
     * forward reference. A reference on the target counts as the inverse when
     * it carries the reverse relation and either is linked to the source via
     * `related_document_id` or its value ends with the source identifier.
     *
     * @param targetDatasetId id of the dataset that should hold the reverse reference
     * @param sourceDatasetId id of the dataset that owns the forward reference
     * @param originalRelation relation of the forward reference
     * @param sourceDatasetIdentifier identifier (DOI) of the source dataset, or null if it has none
     * @returns true when a reverse reference exists, or when the relation has no known inverse
     */
    private async checkReverseReferenceExists(
        targetDatasetId: number,
        sourceDatasetId: number,
        originalRelation: string,
        sourceDatasetIdentifier: string | null,
    ): Promise<boolean> {
        const reverseRelation = this.getReverseRelation(originalRelation);
        if (!reverseRelation) {
            return true; // No inverse defined: report as "exists" so the caller skips it
        }

        const reverseReference = await DatasetReference.query()
            .where('document_id', targetDatasetId)
            .where('relation', reverseRelation)
            .where((query) => {
                // References created by --fix carry related_document_id ...
                query.where('related_document_id', sourceDatasetId);
                // ... other ones are recognised by their value pointing back to the source identifier.
                if (sourceDatasetIdentifier) {
                    query.orWhere('value', 'like', `%${sourceDatasetIdentifier}`);
                }
            })
            .first();

        return reverseReference !== null;
    }

    /**
     * Returns the inverse of a relation, or null when the relation is not one
     * of the relations handled by this command.
     */
    private getReverseRelation(relation: string): string | null {
        return DetectMissingCrossReferences.REVERSE_RELATIONS.get(relation) ?? null;
    }

    /** Prints the detailed, numbered report of missing references to stdout. */
    private printMissingReferencesList(missingReferences: MissingCrossReference[]) {
        console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
        console.log('│ MISSING CROSS-REFERENCES REPORT │');
        console.log('│ (Published Datasets Only - Filtered Relations) │');
        console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
        console.log();

        missingReferences.forEach((missing, index) => {
            console.log(
                `${index + 1}. Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId} Identifier: ${missing.sourceDoi})\n${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId} Identifier: ${missing.targetDoi})`,
            );
            console.log(` ├─ Current relation: "${missing.relation}"`);
            console.log(` ├─ Missing reverse relation: "${missing.reverseRelation}"`);
            console.log(` ├─ Reference type: ${missing.referenceType}`);
            console.log(` └─ DOI/URL: ${missing.doi}`);
            console.log();
        });

        console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
        console.log(`│ SUMMARY: ${missingReferences.length} missing reverse reference(s) detected │`);
        console.log(`│ Processed relations: ${this.ALLOWED_RELATIONS.join(', ')}`);
        console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
    }

    /**
     * Creates the missing reverse reference for every entry and, where the
     * forward reference has no `related_document_id` yet, links it to the
     * target dataset. A failure for one entry is logged and counted; the
     * remaining entries are still processed.
     *
     * @returns how many reverse references were created and how many entries failed or were skipped
     */
    private async fixMissingReferences(
        missingReferences: MissingCrossReference[],
    ): Promise<{ fixedCount: number; errorCount: number }> {
        this.logger.info('🔧 Creating missing cross-references in database...');

        let fixedCount = 0;
        let errorCount = 0;

        for (const [index, missing] of missingReferences.entries()) {
            try {
                // Both datasets must still exist and be published at fix time.
                const sourceDataset = await Dataset.query()
                    .where('id', missing.sourceDatasetId)
                    .where('server_state', 'published')
                    .preload('identifier')
                    .preload('titles') // Preload titles to get mainTitle
                    .first();
                const targetDataset = await Dataset.query()
                    .where('id', missing.targetDatasetId)
                    .where('server_state', 'published')
                    .first();

                if (!sourceDataset) {
                    this.logger.warning(`⚠️ Source dataset ${missing.sourceDatasetId} not found or not published, skipping...`);
                    errorCount++;
                    continue;
                }
                if (!targetDataset) {
                    this.logger.warning(`⚠️ Target dataset ${missing.targetDatasetId} not found or not published, skipping...`);
                    errorCount++;
                    continue;
                }

                // Link the forward reference to its target if that link is still missing.
                // The stored value is matched exactly; a substring match on the DOI could
                // pick up a reference to a different dataset whose DOI shares the prefix.
                if (missing.doi) {
                    const originalReference = await DatasetReference.query()
                        .where('document_id', missing.sourceDatasetId)
                        .where('relation', missing.relation)
                        .where('value', missing.doi)
                        .first();
                    if (originalReference && !originalReference.related_document_id) {
                        originalReference.related_document_id = missing.targetDatasetId;
                        await originalReference.save();
                        if (this.verbose) {
                            this.logger.info(`🔗 Updated original reference with related_document_id: ${missing.targetDatasetId}`);
                        }
                    }
                }

                // Build the reverse reference.
                // Example: Dataset 297 IsNewVersionOf Dataset 144
                //   -> Dataset 144 gets a reference "IsPreviousVersionOf" pointing to Dataset 297.
                const reverseReference = new DatasetReference();
                reverseReference.document_id = missing.targetDatasetId;
                reverseReference.related_document_id = missing.sourceDatasetId;
                reverseReference.relation = missing.reverseRelation;

                const sourceDoi = sourceDataset.identifier?.value;
                if (sourceDoi) {
                    reverseReference.type = 'DOI';
                    reverseReference.value = `https://doi.org/${sourceDoi}`;
                } else {
                    // No DOI available: fall back to the dataset URL and type it accordingly.
                    reverseReference.type = 'URL';
                    reverseReference.value = `https://tethys.at/dataset/${sourceDataset.publish_id ?? missing.sourceDatasetId}`;
                }

                // Prefer the label of the forward reference, then the source dataset's main title.
                reverseReference.label = missing.sourceReferenceLabel || sourceDataset.mainTitle || `Dataset ${missing.sourceDatasetId}`;

                // Save the reference first; only a successful insert should mark the target as modified.
                await reverseReference.save();

                // Touch 'server_date_modified' on the target dataset to trigger downstream updates (e.g. search index).
                targetDataset.server_date_modified = DateTime.now();
                await targetDataset.save();

                fixedCount++;

                if (this.verbose) {
                    this.logger.info(
                        `✅ [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.targetDatasetId} -> ${missing.sourceDatasetId} (${missing.reverseRelation})`,
                    );
                } else if ((index + 1) % 10 === 0) {
                    this.logger.info(`📈 Fixed ${fixedCount}/${missingReferences.length} references...`);
                }
            } catch (error) {
                this.logger.error(
                    `❌ Error creating reverse reference for datasets ${missing.targetDatasetId} -> ${missing.sourceDatasetId}: ${this.formatError(error)}`,
                );
                errorCount++;
            }
        }

        this.logger.info(`📊 Fix completed: ${fixedCount} created, ${errorCount} errors`);
        return { fixedCount, errorCount };
    }
}