fix: repair related_document_id for version references
Add `fix:version-related-ids` ace command to backfill and correct related_document_id on IsNewVersionOf / IsPreviousVersionOf references, resolving the target dataset via its DOI. Handles both NULL and self-referential (wrong) values that the existing detect command could not repair. Make the dataset version-chain API DOI-based: resolve previous/newer versions through the DOI in the reference value instead of the unreliable related_document_id, so the chain is correct regardless of the stored FK.
This commit is contained in:
parent
9368a0dd8d
commit
9c0221ce27
2 changed files with 347 additions and 69 deletions
189
commands/fix_version_related_ids.ts
Normal file
189
commands/fix_version_related_ids.ts
Normal file
|
|
@ -0,0 +1,189 @@
|
|||
/*
|
||||
|--------------------------------------------------------------------------
|
||||
| node ace make:command fix-version-related-ids
|
||||
| DONE: create commands/fix_version_related_ids.ts
|
||||
|--------------------------------------------------------------------------
|
||||
| Repairs the `related_document_id` foreign key on version references
|
||||
| (IsNewVersionOf / IsPreviousVersionOf, both directions).
|
||||
|
|
||||
| The DOI stored in `value` is the reliable link; `related_document_id`
|
||||
| is frequently NULL or self-referential. This command resolves the target
|
||||
| dataset via its DOI and sets `related_document_id` accordingly, correcting
|
||||
| both NULL and wrong-but-non-null values.
|
||||
|
|
||||
| Examples:
|
||||
| node ace fix:version-related-ids // dry run, all datasets
|
||||
| node ace fix:version-related-ids --verbose // dry run with per-row detail
|
||||
| node ace fix:version-related-ids --fix // apply changes
|
||||
| node ace fix:version-related-ids --fix -p 226 // apply, only refs owned by publish_id 226
|
||||
*/
|
||||
import { BaseCommand, flags } from '@adonisjs/core/ace';
|
||||
import type { CommandOptions } from '@adonisjs/core/types/ace';
|
||||
import Dataset from '#models/dataset';
|
||||
import DatasetReference from '#models/dataset_reference';
|
||||
|
||||
/**
 * Ace command that repairs the `related_document_id` foreign key on version
 * references (`IsNewVersionOf` / `IsPreviousVersionOf`).
 *
 * The reference `value` (a DOI or a tethys dataset URL) is treated as the
 * source of truth: the target dataset is looked up from it and
 * `related_document_id` is set to that dataset's id. Rows whose stored id is
 * NULL and rows whose stored id is non-null but different are both repaired.
 *
 * Without `--fix` the command only reports what it would change.
 */
export default class FixVersionRelatedIds extends BaseCommand {
    static commandName = 'fix:version-related-ids';
    static description =
        'Backfill/repair related_document_id on IsNewVersionOf / IsPreviousVersionOf references by resolving the target dataset via its DOI';

    public static needsApplication = true;

    @flags.boolean({ alias: 'f', description: 'Apply changes. Without this flag the command runs as a dry run.' })
    public fix: boolean = false;

    @flags.boolean({ alias: 'v', description: 'Verbose output (per-reference detail)' })
    public verbose: boolean = false;

    @flags.number({ alias: 'p', description: 'Only process references owned by this publish_id' })
    public publish_id?: number;

    public static options: CommandOptions = {
        startApp: true,
        staysAlive: false,
    };

    // Only the version relations, both directions.
    private readonly VERSION_RELATIONS = ['IsNewVersionOf', 'IsPreviousVersionOf'];

    /**
     * Scan all matching version references, resolve each one's target dataset
     * and (when `--fix` is given) persist the corrected `related_document_id`.
     * Prints a summary of already-correct, filled, corrected and unresolved rows.
     *
     * On failure the error is logged and the command's exit code is set to 1.
     */
    async run() {
        // Narrow once into a const so the filter callback below needs no cast.
        const ownerPublishId = this.publish_id;

        this.logger.info(`🔍 Scanning ${this.VERSION_RELATIONS.join(' / ')} references...`);
        this.logger.info(this.fix ? '✏️ Mode: APPLY (changes will be written)' : '👀 Mode: DRY RUN (no changes written)');
        if (typeof ownerPublishId === 'number') {
            this.logger.info(`🎯 Filtering by owning publish_id: ${ownerPublishId}`);
        }

        try {
            const query = DatasetReference.query()
                .whereIn('relation', this.VERSION_RELATIONS)
                .whereIn('type', ['DOI', 'URL'])
                .where((q) => {
                    q.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
                });

            // Restrict to references owned by a specific dataset (by publish_id), if requested.
            if (typeof ownerPublishId === 'number') {
                query.whereHas('dataset', (d) => d.where('publish_id', ownerPublishId));
            }

            const refs = await query.exec();
            this.logger.info(`🔗 Found ${refs.length} version reference(s) to inspect`);

            let alreadyCorrect = 0;
            let filledFromNull = 0;
            let correctedWrong = 0;
            let unresolved = 0;

            for (const ref of refs) {
                const target = await this.resolveTarget(ref);

                if (!target) {
                    unresolved++;
                    if (this.verbose) {
                        this.logger.warning(`⚠️ Reference ${ref.id}: could not resolve target (value: ${ref.value})`);
                    }
                    continue;
                }

                // Never let a reference point at its own owning document.
                if (target.id === ref.document_id) {
                    unresolved++;
                    if (this.verbose) {
                        this.logger.warning(
                            `⚠️ Reference ${ref.id}: target resolves to its own document (${ref.document_id}); skipping self-link`,
                        );
                    }
                    continue;
                }

                if (ref.related_document_id === target.id) {
                    alreadyCorrect++;
                    continue;
                }

                // Remember the stored value before overwriting it, for counters and logging.
                const previous = ref.related_document_id;
                const wasNull = previous === null || previous === undefined;

                if (this.fix) {
                    ref.related_document_id = target.id;
                    await ref.save();
                }

                if (wasNull) {
                    filledFromNull++;
                } else {
                    correctedWrong++;
                }

                if (this.verbose) {
                    const action = this.fix ? 'Updated' : '📝 Would update';
                    this.logger.info(
                        `${action} reference ${ref.id} (doc ${ref.document_id}, ${ref.relation}): ` +
                            `related_document_id ${previous ?? 'NULL'} → ${target.id} (publish_id ${target.publish_id})`,
                    );
                }
            }

            this.logger.info('────────────────────────────────────────');
            this.logger.info(`✔️ Already correct: ${alreadyCorrect}`);
            this.logger.info(`➕ Filled from NULL: ${filledFromNull}`);
            this.logger.info(`🔧 Corrected wrong value: ${correctedWrong}`);
            this.logger.info(`⚠️ Unresolved/skipped: ${unresolved}`);
            this.logger.info('────────────────────────────────────────');

            const changes = filledFromNull + correctedWrong;
            if (!this.fix && changes > 0) {
                this.logger.info(`💡 Dry run only. Re-run with --fix to write ${changes} change(s).`);
            } else if (this.fix) {
                this.logger.success(`Done. ${changes} reference(s) updated.`);
            } else {
                this.logger.success('Nothing to change — all version references already linked correctly.');
            }
        } catch (error: unknown) {
            // The logger's second parameter is an options bag, not a payload, so the
            // failure reason has to be folded into the message to be visible.
            const detail = error instanceof Error ? error.message : String(error);
            this.logger.error(`Error fixing version related_document_id values: ${detail}`);
            // Report failure through the command's exit code instead of process.exit(),
            // so the framework can still shut the application down cleanly.
            this.exitCode = 1;
        }
    }

    /**
     * Resolve the dataset a version reference points to.
     *
     * Tries the DOI contained in `ref.value` first (matched against the
     * dataset's identifier value); if that yields nothing, falls back to a
     * publish_id parsed from a `tethys.at/dataset/<id>` URL.
     *
     * @param ref - the version reference to resolve
     * @returns the target dataset, or `null` if neither lookup finds one
     */
    private async resolveTarget(ref: DatasetReference): Promise<Dataset | null> {
        const doi = this.normalizeDoi(ref.value);
        if (doi) {
            const byDoi = await Dataset.query()
                .whereHas('identifier', (q) => q.where('value', doi))
                .first();
            if (byDoi) return byDoi;
        }

        const publishId = this.extractPublishId(ref.value);
        if (publishId !== null) {
            const byPublishId = await Dataset.query().where('publish_id', publishId).first();
            if (byPublishId) return byPublishId;
        }

        return null;
    }

    /**
     * Strip the resolver prefix so a reference value like
     * "https://doi.org/10.24341/tethys.108.2" matches the identifier
     * table value "10.24341/tethys.108.2".
     *
     * Accepted prefixes: an optional `http://` / `https://` scheme followed by
     * `doi.org/`, `dx.doi.org/` or `www.doi.org/`, or a leading `doi:` label
     * (optionally followed by whitespace). The scheme is optional because the
     * reference query selects on `%doi.org/...%`, which also matches values
     * stored without one.
     *
     * @param value - raw reference value
     * @returns the bare DOI, or `null` if the value is empty or isn't a DOI
     */
    private normalizeDoi(value: string | null): string | null {
        if (!value) return null;
        const cleaned = value
            .trim()
            .replace(/^(?:https?:\/\/)?(?:(?:dx|www)\.)?doi\.org\//i, '')
            .replace(/^doi:\s*/i, '');
        return /^10\.\d{4,}\//.test(cleaned) ? cleaned : null;
    }

    /**
     * Extract the numeric publish_id from a `tethys.at/dataset/<id>` URL.
     *
     * @param value - raw reference value
     * @returns the parsed publish_id, or `null` if the value is empty or
     *          contains no such URL
     */
    private extractPublishId(value: string | null): number | null {
        if (!value) return null;
        const urlMatch = value.match(/tethys\.at\/dataset\/(\d+)/);
        return urlMatch ? Number.parseInt(urlMatch[1], 10) : null;
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue