fix: repair related_document_id for version references

Add `fix:version-related-ids` ace command to backfill and correct
related_document_id on IsNewVersionOf / IsPreviousVersionOf references,
resolving the target dataset via its DOI. Handles both NULL and
self-referential (wrong) values that the existing detect command could
not repair.

Make the dataset version-chain API DOI-based: resolve previous/newer
versions through the DOI in the reference value instead of the
unreliable related_document_id, so the chain is correct regardless of
the stored FK.
This commit is contained in:
Kaimbacher 2026-06-09 14:23:06 +02:00
commit 9c0221ce27
2 changed files with 347 additions and 69 deletions

View file

@ -210,13 +210,13 @@ export default class DatasetController {
*/ */
private async buildVersionChain(dataset: Dataset) { private async buildVersionChain(dataset: Dataset) {
const versionChain = { const versionChain = {
current: { // current: {
id: dataset.id, // id: dataset.id,
publish_id: dataset.publish_id, // publish_id: dataset.publish_id,
doi: dataset.identifier?.value || null, // doi: dataset.identifier?.value || null,
main_title: dataset.mainTitle || null, // main_title: dataset.mainTitle || null,
server_date_published: dataset.server_date_published, // server_date_published: dataset.server_date_published,
}, // },
previousVersions: [] as any[], previousVersions: [] as any[],
newerVersions: [] as any[], newerVersions: [] as any[],
}; };
@ -233,92 +233,181 @@ export default class DatasetController {
/** /**
* Recursively get all previous versions * Recursively get all previous versions
*/ */
// private async getPreviousVersions(datasetId: number, visited: Set<number> = new Set()): Promise<any[]> {
// // Prevent infinite loops
// if (visited.has(datasetId)) {
// return [];
// }
// visited.add(datasetId);
// const previousVersions: any[] = [];
// // Find references where this dataset "IsNewVersionOf" another dataset
// const previousRefs = await DatasetReference.query()
// .where('document_id', datasetId)
// .where('relation', 'IsNewVersionOf')
// .whereNotNull('related_document_id');
// for (const ref of previousRefs) {
// if (!ref.related_document_id) continue;
// const previousDataset = await Dataset.query()
// .where('id', ref.related_document_id)
// .preload('identifier')
// .preload('titles')
// .first();
// if (previousDataset) {
// const versionInfo = {
// id: previousDataset.id,
// publish_id: previousDataset.publish_id,
// doi: previousDataset.identifier?.value || null,
// main_title: previousDataset.mainTitle || null,
// server_date_published: previousDataset.server_date_published,
// relation: 'IsPreviousVersionOf', // From perspective of current dataset
// };
// previousVersions.push(versionInfo);
// // Recursively get even older versions
// const olderVersions = await this.getPreviousVersions(previousDataset.id, visited);
// previousVersions.push(...olderVersions);
// }
// }
// return previousVersions;
// }
private async getPreviousVersions(datasetId: number, visited: Set<number> = new Set()): Promise<any[]> { private async getPreviousVersions(datasetId: number, visited: Set<number> = new Set()): Promise<any[]> {
// Prevent infinite loops if (visited.has(datasetId)) return [];
if (visited.has(datasetId)) {
return [];
}
visited.add(datasetId); visited.add(datasetId);
const previousVersions: any[] = []; const result: any[] = [];
// Find references where this dataset "IsNewVersionOf" another dataset // A dataset points to its OLDER version via relation 'IsNewVersionOf'
const previousRefs = await DatasetReference.query() const refs = await DatasetReference.query()
.where('document_id', datasetId) .where('document_id', datasetId)
.where('relation', 'IsNewVersionOf') .where('relation', 'IsNewVersionOf'); // ← removed .whereNotNull('related_document_id')
.whereNotNull('related_document_id');
for (const ref of previousRefs) { for (const ref of refs) {
if (!ref.related_document_id) continue; const related = await this.resolveReferencedDataset(ref, datasetId);
if (!related) continue;
const previousDataset = await Dataset.query() result.push({
.where('id', ref.related_document_id) id: related.id,
.preload('identifier') publish_id: related.publish_id,
.preload('titles') doi: related.identifier?.value || null,
.first(); main_title: related.mainTitle || null,
server_date_published: related.server_date_published,
relation: 'IsPreviousVersionOf',
});
if (previousDataset) { result.push(...(await this.getPreviousVersions(related.id, visited)));
const versionInfo = {
id: previousDataset.id,
publish_id: previousDataset.publish_id,
doi: previousDataset.identifier?.value || null,
main_title: previousDataset.mainTitle || null,
server_date_published: previousDataset.server_date_published,
relation: 'IsPreviousVersionOf', // From perspective of current dataset
};
previousVersions.push(versionInfo);
// Recursively get even older versions
const olderVersions = await this.getPreviousVersions(previousDataset.id, visited);
previousVersions.push(...olderVersions);
}
} }
return previousVersions; return result;
} }
/** /**
* Recursively get all newer versions * Recursively get all newer versions
*/ */
// private async getNewerVersions(datasetId: number, visited: Set<number> = new Set()): Promise<any[]> {
// // Prevent infinite loops
// if (visited.has(datasetId)) {
// return [];
// }
// visited.add(datasetId);
// const newerVersions: any[] = [];
// // Find references where this dataset "IsPreviousVersionOf" another dataset
// const newerRefs = await DatasetReference.query()
// .where('document_id', datasetId)
// .where('relation', 'IsPreviousVersionOf')
// .whereNotNull('related_document_id');
// for (const ref of newerRefs) {
// if (!ref.related_document_id) continue;
// const newerDataset = await Dataset.query().where('id', ref.related_document_id).preload('identifier').preload('titles').first();
// if (newerDataset) {
// const versionInfo = {
// id: newerDataset.id,
// publish_id: newerDataset.publish_id,
// doi: newerDataset.identifier?.value || null,
// main_title: newerDataset.mainTitle || null,
// server_date_published: newerDataset.server_date_published,
// relation: 'IsNewVersionOf', // From perspective of current dataset
// };
// newerVersions.push(versionInfo);
// // Recursively get even newer versions
// const evenNewerVersions = await this.getNewerVersions(newerDataset.id, visited);
// newerVersions.push(...evenNewerVersions);
// }
// }
// return newerVersions;
// }
private async getNewerVersions(datasetId: number, visited: Set<number> = new Set()): Promise<any[]> { private async getNewerVersions(datasetId: number, visited: Set<number> = new Set()): Promise<any[]> {
// Prevent infinite loops if (visited.has(datasetId)) return [];
if (visited.has(datasetId)) {
return [];
}
visited.add(datasetId); visited.add(datasetId);
const newerVersions: any[] = []; const result: any[] = [];
// Find references where this dataset "IsPreviousVersionOf" another dataset // A dataset points to its NEWER version via relation 'IsPreviousVersionOf'
const newerRefs = await DatasetReference.query() const refs = await DatasetReference.query()
.where('document_id', datasetId) .where('document_id', datasetId)
.where('relation', 'IsPreviousVersionOf') .where('relation', 'IsPreviousVersionOf'); // ← removed .whereNotNull(...)
.whereNotNull('related_document_id');
for (const ref of newerRefs) { for (const ref of refs) {
if (!ref.related_document_id) continue; const related = await this.resolveReferencedDataset(ref, datasetId);
if (!related) continue;
const newerDataset = await Dataset.query().where('id', ref.related_document_id).preload('identifier').preload('titles').first(); result.push({
id: related.id,
publish_id: related.publish_id,
doi: related.identifier?.value || null,
main_title: related.mainTitle || null,
server_date_published: related.server_date_published,
relation: 'IsNewVersionOf',
});
if (newerDataset) { result.push(...(await this.getNewerVersions(related.id, visited)));
const versionInfo = {
id: newerDataset.id,
publish_id: newerDataset.publish_id,
doi: newerDataset.identifier?.value || null,
main_title: newerDataset.mainTitle || null,
server_date_published: newerDataset.server_date_published,
relation: 'IsNewVersionOf', // From perspective of current dataset
};
newerVersions.push(versionInfo);
// Recursively get even newer versions
const evenNewerVersions = await this.getNewerVersions(newerDataset.id, visited);
newerVersions.push(...evenNewerVersions);
}
} }
return newerVersions; return result;
}
/**
 * Resolve the dataset a version reference points to.
 *
 * Resolution order:
 *  1. the DOI found in `ref.value` (after `normalizeDoi` strips the resolver
 *     prefix), matched against the dataset's `identifier` relation;
 *  2. the stored `ref.related_document_id` foreign key.
 *
 * Neither path may return the dataset that owns the reference: callers push
 * the result into the version chain before their `visited` check runs, so a
 * self-referential DOI or FK would otherwise list a dataset as its own version.
 *
 * @param ref - the version reference (`IsNewVersionOf` / `IsPreviousVersionOf`) to resolve
 * @param currentDatasetId - id of the dataset that owns `ref`
 * @returns the referenced dataset with `identifier` and `titles` preloaded, or `null` when it cannot be resolved
 */
private async resolveReferencedDataset(ref: DatasetReference, currentDatasetId: number): Promise<Dataset | null> {
    const doi = this.normalizeDoi(ref.value);
    if (doi) {
        const byDoi = await Dataset.query()
            .whereHas('identifier', (q) => q.where('value', doi))
            .preload('identifier')
            .preload('titles') // needed so mainTitle computes
            .first();
        // Ignore a DOI that resolves to the owning dataset itself; fall through to the FK.
        if (byDoi && byDoi.id !== currentDatasetId) return byDoi;
    }

    // Fallback: trust the stored FK only when it is set and not self-referential.
    if (ref.related_document_id && ref.related_document_id !== currentDatasetId) {
        return await Dataset.query()
            .where('id', ref.related_document_id)
            .preload('identifier')
            .preload('titles')
            .first();
    }

    return null;
}
/**
 * Strip a DOI resolver prefix from a reference value.
 *
 * Trims surrounding whitespace, then removes a leading
 * `http://` / `https://` + `doi.org/` or `dx.doi.org/` prefix (case-insensitive)
 * and a leading `doi:` scheme.
 *
 * Note: no DOI shape check is performed here, so a non-DOI value (for example
 * a plain URL) is returned trimmed but otherwise unchanged.
 *
 * @param value - raw reference value, may be `null`
 * @returns the value without resolver prefix, or `null` when `value` is null or empty
 */
private normalizeDoi(value: string | null): string | null {
if (!value) return null;
return value
.trim()
.replace(/^https?:\/\/(dx\.)?doi\.org\//i, '')
.replace(/^doi:/i, '');
} }
} }

View file

@ -0,0 +1,189 @@
/*
|--------------------------------------------------------------------------
| node ace make:command fix-version-related-ids
| DONE: create commands/fix_version_related_ids.ts
|--------------------------------------------------------------------------
| Repairs the `related_document_id` foreign key on version references
| (IsNewVersionOf / IsPreviousVersionOf, both directions).
|
| The DOI stored in `value` is the reliable link; `related_document_id`
| is frequently NULL or self-referential. This command resolves the target
| dataset via its DOI and sets `related_document_id` accordingly, correcting
| both NULL and wrong-but-non-null values.
|
| Examples:
| node ace fix:version-related-ids // dry run, all datasets
| node ace fix:version-related-ids --verbose // dry run with per-row detail
| node ace fix:version-related-ids --fix // apply changes
| node ace fix:version-related-ids --fix -p 226 // apply, only refs owned by publish_id 226
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import type { CommandOptions } from '@adonisjs/core/types/ace';
import Dataset from '#models/dataset';
import DatasetReference from '#models/dataset_reference';
/**
 * Ace command `fix:version-related-ids`.
 *
 * Scans version references (`IsNewVersionOf` / `IsPreviousVersionOf`) whose
 * value looks like a Tethys DOI or a Tethys dataset URL, resolves the dataset
 * each one points to, and sets `related_document_id` to that dataset's id.
 * Runs as a dry run unless `--fix` is passed.
 */
export default class FixVersionRelatedIds extends BaseCommand {
    static commandName = 'fix:version-related-ids';
    static description =
        'Backfill/repair related_document_id on IsNewVersionOf / IsPreviousVersionOf references by resolving the target dataset via its DOI';

    public static needsApplication = true;

    @flags.boolean({ alias: 'f', description: 'Apply changes. Without this flag the command runs as a dry run.' })
    public fix: boolean = false;

    @flags.boolean({ alias: 'v', description: 'Verbose output (per-reference detail)' })
    public verbose: boolean = false;

    @flags.number({ alias: 'p', description: 'Only process references owned by this publish_id' })
    public publish_id?: number;

    public static options: CommandOptions = {
        startApp: true,
        staysAlive: false,
    };

    // Only the version relations, both directions.
    private readonly VERSION_RELATIONS = ['IsNewVersionOf', 'IsPreviousVersionOf'];

    /**
     * Inspect every matching version reference and (with `--fix`) repair its
     * `related_document_id`. Prints a summary of what was, or would be, changed.
     * On failure the error is logged and the command exit code is set to 1.
     */
    async run(): Promise<void> {
        this.logger.info(`🔍 Scanning ${this.VERSION_RELATIONS.join(' / ')} references...`);
        this.logger.info(this.fix ? '✏️ Mode: APPLY (changes will be written)' : '👀 Mode: DRY RUN (no changes written)');

        // Narrow the optional flag once so no type assertion is needed further down.
        const ownerPublishId = typeof this.publish_id === 'number' ? this.publish_id : undefined;
        if (ownerPublishId !== undefined) {
            this.logger.info(`🎯 Filtering by owning publish_id: ${ownerPublishId}`);
        }

        try {
            const query = DatasetReference.query()
                .whereIn('relation', this.VERSION_RELATIONS)
                .whereIn('type', ['DOI', 'URL'])
                .where((q) => {
                    q.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
                });

            // Restrict to references owned by a specific dataset (by publish_id), if requested.
            if (ownerPublishId !== undefined) {
                query.whereHas('dataset', (d) => d.where('publish_id', ownerPublishId));
            }

            const refs = await query.exec();
            this.logger.info(`🔗 Found ${refs.length} version reference(s) to inspect`);

            let alreadyCorrect = 0;
            let filledFromNull = 0;
            let correctedWrong = 0;
            let unresolved = 0;

            for (const ref of refs) {
                const target = await this.resolveTarget(ref);
                if (!target) {
                    unresolved++;
                    if (this.verbose) {
                        this.logger.warning(`⚠️ Reference ${ref.id}: could not resolve target (value: ${ref.value})`);
                    }
                    continue;
                }

                // Never let a reference point at its own owning document.
                if (target.id === ref.document_id) {
                    unresolved++;
                    if (this.verbose) {
                        this.logger.warning(
                            `⚠️ Reference ${ref.id}: target resolves to its own document (${ref.document_id}); skipping self-link`,
                        );
                    }
                    continue;
                }

                if (ref.related_document_id === target.id) {
                    alreadyCorrect++;
                    continue;
                }

                // Remember the old value before overwriting it: it drives both the counters and the log line.
                const previous = ref.related_document_id;
                const wasNull = previous === null || previous === undefined;

                if (this.fix) {
                    ref.related_document_id = target.id;
                    await ref.save();
                }

                if (wasNull) {
                    filledFromNull++;
                } else {
                    correctedWrong++;
                }

                if (this.verbose) {
                    const action = this.fix ? 'Updated' : '📝 Would update';
                    // Old and new id are separated by an arrow; without it they ran together (e.g. "NULL123").
                    this.logger.info(
                        `${action} reference ${ref.id} (doc ${ref.document_id}, ${ref.relation}): ` +
                            `related_document_id ${previous ?? 'NULL'} → ${target.id} (publish_id ${target.publish_id})`,
                    );
                }
            }

            this.logger.info('────────────────────────────────────────');
            this.logger.info(`✔️ Already correct: ${alreadyCorrect}`);
            this.logger.info(` Filled from NULL: ${filledFromNull}`);
            this.logger.info(`🔧 Corrected wrong value: ${correctedWrong}`);
            this.logger.info(`⚠️ Unresolved/skipped: ${unresolved}`);
            this.logger.info('────────────────────────────────────────');

            const changes = filledFromNull + correctedWrong;
            if (!this.fix && changes > 0) {
                this.logger.info(`💡 Dry run only. Re-run with --fix to write ${changes} change(s).`);
            } else if (this.fix) {
                this.logger.success(`Done. ${changes} reference(s) updated.`);
            } else {
                this.logger.success('Nothing to change — all version references already linked correctly.');
            }
        } catch (error: unknown) {
            // The logger's second parameter is an options object, not extra log arguments,
            // so the error detail has to be part of the message itself.
            const detail = error instanceof Error ? (error.stack ?? error.message) : String(error);
            this.logger.error(`Error fixing version related_document_id values: ${detail}`);
            // Report failure through Ace instead of process.exit() so the app can shut down cleanly.
            this.exitCode = 1;
        }
    }

    /**
     * Resolve the dataset a version reference points to.
     * Prefers the DOI in `value` (reliable); falls back to a tethys publish_id URL.
     *
     * @param ref - the version reference to resolve
     * @returns the target dataset, or `null` when neither lookup finds one
     */
    private async resolveTarget(ref: DatasetReference): Promise<Dataset | null> {
        const doi = this.normalizeDoi(ref.value);
        if (doi) {
            const byDoi = await Dataset.query()
                .whereHas('identifier', (q) => q.where('value', doi))
                .first();
            if (byDoi) return byDoi;
        }

        const publishId = this.extractPublishId(ref.value);
        if (publishId !== null) {
            const byPublishId = await Dataset.query().where('publish_id', publishId).first();
            if (byPublishId) return byPublishId;
        }

        return null;
    }

    /**
     * Strip the resolver prefix so a reference value like
     * "https://doi.org/10.24341/tethys.108.2" matches the identifier
     * table value "10.24341/tethys.108.2". Returns null if it isn't a DOI.
     *
     * The scheme is optional and a `dx.` or `www.` host prefix is accepted,
     * because the SQL filter in `run()` (`%doi.org/10.24341/tethys.%`) also
     * selects values written without `http(s)://`.
     *
     * @param value - raw reference value, may be `null`
     * @returns the bare DOI (starting with `10.`), or `null`
     */
    private normalizeDoi(value: string | null): string | null {
        if (!value) return null;
        const cleaned = value
            .trim()
            .replace(/^(?:https?:\/\/)?(?:(?:dx|www)\.)?doi\.org\//i, '')
            .replace(/^doi:/i, '');
        return /^10\.\d{4,}\//.test(cleaned) ? cleaned : null;
    }

    /**
     * Extract the numeric publish_id from a `tethys.at/dataset/<id>` URL.
     *
     * @param value - raw reference value, may be `null`
     * @returns the publish_id, or `null` when the value holds no such URL
     */
    private extractPublishId(value: string | null): number | null {
        if (!value) return null;
        const urlMatch = value.match(/tethys\.at\/dataset\/(\d+)/);
        return urlMatch ? Number.parseInt(urlMatch[1], 10) : null;
    }
}