tethys.backend/commands/fix_version_related_ids.ts
Arno Kaimbacher 9c0221ce27 fix: repair related_document_id for version references
Add `fix:version-related-ids` ace command to backfill and correct
related_document_id on IsNewVersionOf / IsPreviousVersionOf references,
resolving the target dataset via its DOI. Handles both NULL and
self-referential (wrong) values that the existing detect command could
not repair.

Make the dataset version-chain API DOI-based: resolve previous/newer
versions through the DOI in the reference value instead of the
unreliable related_document_id, so the chain is correct regardless of
the stored FK.
2026-06-09 14:23:06 +02:00

189 lines
No EOL
8 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
|--------------------------------------------------------------------------
| node ace make:command fix-version-related-ids
| DONE: create commands/fix_version_related_ids.ts
|--------------------------------------------------------------------------
| Repairs the `related_document_id` foreign key on version references
| (IsNewVersionOf / IsPreviousVersionOf, both directions).
|
| The DOI stored in `value` is the reliable link; `related_document_id`
| is frequently NULL or self-referential. This command resolves the target
| dataset via its DOI and sets `related_document_id` accordingly, correcting
| both NULL and wrong-but-non-null values.
|
| Examples:
| node ace fix:version-related-ids // dry run, all datasets
| node ace fix:version-related-ids --verbose // dry run with per-row detail
| node ace fix:version-related-ids --fix // apply changes
| node ace fix:version-related-ids --fix -p 226 // apply, only refs owned by publish_id 226
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import type { CommandOptions } from '@adonisjs/core/types/ace';
import Dataset from '#models/dataset';
import DatasetReference from '#models/dataset_reference';
/**
 * Ace command `fix:version-related-ids`.
 *
 * Scans dataset references whose relation is IsNewVersionOf / IsPreviousVersionOf,
 * resolves the dataset each reference value points to (by DOI first, then by a
 * tethys.at publish_id URL) and sets `related_document_id` to that dataset's id.
 * Without `--fix` nothing is written; the command only reports what it would do.
 */
export default class FixVersionRelatedIds extends BaseCommand {
    static commandName = 'fix:version-related-ids';
    static description =
        'Backfill/repair related_document_id on IsNewVersionOf / IsPreviousVersionOf references by resolving the target dataset via its DOI';

    public static needsApplication = true;

    @flags.boolean({ alias: 'f', description: 'Apply changes. Without this flag the command runs as a dry run.' })
    public fix: boolean = false;

    @flags.boolean({ alias: 'v', description: 'Verbose output (per-reference detail)' })
    public verbose: boolean = false;

    @flags.number({ alias: 'p', description: 'Only process references owned by this publish_id' })
    public publish_id?: number;

    public static options: CommandOptions = {
        startApp: true,
        staysAlive: false,
    };

    // Only the version relations, both directions.
    private readonly VERSION_RELATIONS = ['IsNewVersionOf', 'IsPreviousVersionOf'];

    /**
     * Command entry point.
     *
     * Loads every matching version reference, classifies it (already correct,
     * NULL, wrong, unresolved) and - when `--fix` is set - persists the corrected
     * `related_document_id`. On failure the error message is logged and the
     * command finishes with exit code 1.
     */
    async run() {
        this.logger.info(`🔍 Scanning ${this.VERSION_RELATIONS.join(' / ')} references...`);
        this.logger.info(this.fix ? '✏️ Mode: APPLY (changes will be written)' : '👀 Mode: DRY RUN (no changes written)');

        // Capture the flag in a const so the narrowing to `number` carries into the query callback.
        const ownerPublishId = this.publish_id;
        if (typeof ownerPublishId === 'number') {
            this.logger.info(`🎯 Filtering by owning publish_id: ${ownerPublishId}`);
        }

        try {
            const query = DatasetReference.query()
                .whereIn('relation', this.VERSION_RELATIONS)
                .whereIn('type', ['DOI', 'URL'])
                .where((q) => {
                    q.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
                });

            // Restrict to references owned by a specific dataset (by publish_id), if requested.
            if (typeof ownerPublishId === 'number') {
                query.whereHas('dataset', (d) => d.where('publish_id', ownerPublishId));
            }

            const refs = await query.exec();
            this.logger.info(`🔗 Found ${refs.length} version reference(s) to inspect`);

            let alreadyCorrect = 0;
            let filledFromNull = 0;
            let correctedWrong = 0;
            let unresolved = 0;

            for (const ref of refs) {
                const target = await this.resolveTarget(ref);
                if (!target) {
                    unresolved++;
                    if (this.verbose) {
                        this.logger.warning(`⚠️ Reference ${ref.id}: could not resolve target (value: ${ref.value})`);
                    }
                    continue;
                }

                // Never let a reference point at its own owning document.
                if (target.id === ref.document_id) {
                    unresolved++;
                    if (this.verbose) {
                        this.logger.warning(
                            `⚠️ Reference ${ref.id}: target resolves to its own document (${ref.document_id}); skipping self-link`,
                        );
                    }
                    continue;
                }

                if (ref.related_document_id === target.id) {
                    alreadyCorrect++;
                    continue;
                }

                // Remember the old value before it is overwritten so it can be classified and reported.
                const previous = ref.related_document_id;
                const wasNull = previous === null || previous === undefined;

                if (this.fix) {
                    ref.related_document_id = target.id;
                    await ref.save();
                }

                // Counters are incremented in dry-run mode too: they then describe pending changes.
                if (wasNull) {
                    filledFromNull++;
                } else {
                    correctedWrong++;
                }

                if (this.verbose) {
                    const action = this.fix ? 'Updated' : '📝 Would update';
                    this.logger.info(
                        `${action} reference ${ref.id} (doc ${ref.document_id}, ${ref.relation}): ` +
                            `related_document_id ${previous ?? 'NULL'} → ${target.id} (publish_id ${target.publish_id})`,
                    );
                }
            }

            this.logger.info('────────────────────────────────────────');
            this.logger.info(`✔️ Already correct: ${alreadyCorrect}`);
            this.logger.info(` Filled from NULL: ${filledFromNull}`);
            this.logger.info(`🔧 Corrected wrong value: ${correctedWrong}`);
            this.logger.info(`⚠️ Unresolved/skipped: ${unresolved}`);
            this.logger.info('────────────────────────────────────────');

            const changes = filledFromNull + correctedWrong;
            if (!this.fix && changes > 0) {
                this.logger.info(`💡 Dry run only. Re-run with --fix to write ${changes} change(s).`);
            } else if (this.fix) {
                this.logger.success(`Done. ${changes} reference(s) updated.`);
            } else {
                this.logger.success('Nothing to change — all version references already linked correctly.');
            }
        } catch (error) {
            // The logger's second parameter is an options object, not a value to print,
            // so the failure reason has to be part of the message itself.
            const reason = error instanceof Error ? error.message : String(error);
            this.logger.error(`Error fixing version related_document_id values: ${reason}`);
            // Report failure through the command instead of killing the process mid-flight.
            this.error = error;
            this.exitCode = 1;
        }
    }

    /**
     * Resolve the dataset a version reference points to.
     * Prefers the DOI in `value` (reliable); falls back to a tethys publish_id URL.
     *
     * @param ref - The version reference whose `value` should be resolved.
     * @returns The matching dataset, or null when neither lookup finds one.
     */
    private async resolveTarget(ref: DatasetReference): Promise<Dataset | null> {
        const doi = this.normalizeDoi(ref.value);
        if (doi) {
            const byDoi = await Dataset.query()
                .whereHas('identifier', (q) => q.where('value', doi))
                .first();
            if (byDoi) return byDoi;
        }

        const publishId = this.extractPublishId(ref.value);
        if (publishId) {
            const byPublishId = await Dataset.query().where('publish_id', publishId).first();
            if (byPublishId) return byPublishId;
        }

        return null;
    }

    /**
     * Strip the resolver prefix so a reference value like
     * "https://doi.org/10.24341/tethys.108.2" matches the identifier
     * table value "10.24341/tethys.108.2". Returns null if it isn't a DOI.
     *
     * Accepted prefixes: an optional http(s) scheme followed by doi.org,
     * dx.doi.org or www.doi.org, or a leading "doi:". Scheme-less values are
     * accepted because the LIKE filter in `run` selects them as well.
     *
     * @param value - Raw reference value, possibly null.
     * @returns The bare DOI ("10.<registrant>/<suffix>") or null.
     */
    private normalizeDoi(value: string | null): string | null {
        if (!value) return null;
        const cleaned = value
            .trim()
            .replace(/^(?:https?:\/\/)?(?:dx\.|www\.)?doi\.org\//i, '')
            .replace(/^doi:/i, '');
        return /^10\.\d{4,}\//.test(cleaned) ? cleaned : null;
    }

    /**
     * Extract the numeric id from a "tethys.at/dataset/<digits>" URL.
     *
     * @param value - Raw reference value, possibly null.
     * @returns The parsed number, or null when the value has no such URL part.
     */
    private extractPublishId(value: string | null): number | null {
        if (!value) return null;
        const urlMatch = value.match(/tethys\.at\/dataset\/(\d+)/);
        return urlMatch ? parseInt(urlMatch[1], 10) : null;
    }
}