diff --git a/app/Controllers/Http/Api/DatasetController.ts b/app/Controllers/Http/Api/DatasetController.ts
index 9106521..8cb17b6 100644
--- a/app/Controllers/Http/Api/DatasetController.ts
+++ b/app/Controllers/Http/Api/DatasetController.ts
@@ -210,13 +210,13 @@ export default class DatasetController {
      */
     private async buildVersionChain(dataset: Dataset) {
         const versionChain = {
-            current: {
-                id: dataset.id,
-                publish_id: dataset.publish_id,
-                doi: dataset.identifier?.value || null,
-                main_title: dataset.mainTitle || null,
-                server_date_published: dataset.server_date_published,
-            },
+            // current: {
+            //     id: dataset.id,
+            //     publish_id: dataset.publish_id,
+            //     doi: dataset.identifier?.value || null,
+            //     main_title: dataset.mainTitle || null,
+            //     server_date_published: dataset.server_date_published,
+            // },
             previousVersions: [] as any[],
             newerVersions: [] as any[],
         };
@@ -233,92 +233,122 @@ export default class DatasetController {
     /**
      * Recursively get all previous versions
      */
     private async getPreviousVersions(datasetId: number, visited: Set = new Set()): Promise {
         // Prevent infinite loops
-        if (visited.has(datasetId)) {
-            return [];
-        }
+        if (visited.has(datasetId)) return [];
         visited.add(datasetId);
 
-        const previousVersions: any[] = [];
+        const result: any[] = [];
 
-        // Find references where this dataset "IsNewVersionOf" another dataset
-        const previousRefs = await DatasetReference.query()
+        // A dataset points to its OLDER version via relation 'IsNewVersionOf'.
+        // related_document_id is deliberately not filtered on here: the target is
+        // resolved from the DOI in `value` by resolveReferencedDataset().
+        const refs = await DatasetReference.query()
             .where('document_id', datasetId)
-            .where('relation', 'IsNewVersionOf')
-            .whereNotNull('related_document_id');
+            .where('relation', 'IsNewVersionOf');
 
-        for (const ref of previousRefs) {
-            if (!ref.related_document_id) continue;
+        for (const ref of refs) {
+            const related = await this.resolveReferencedDataset(ref, datasetId);
+            if (!related) continue;
 
-            const previousDataset = await Dataset.query()
-                .where('id', ref.related_document_id)
-                .preload('identifier')
-                .preload('titles')
-                .first();
+            result.push({
+                id: related.id,
+                publish_id: related.publish_id,
+                doi: related.identifier?.value || null,
+                main_title: related.mainTitle || null,
+                server_date_published: related.server_date_published,
+                relation: 'IsPreviousVersionOf',
+            });
 
-            if (previousDataset) {
-                const versionInfo = {
-                    id: previousDataset.id,
-                    publish_id: previousDataset.publish_id,
-                    doi: previousDataset.identifier?.value || null,
-                    main_title: previousDataset.mainTitle || null,
-                    server_date_published: previousDataset.server_date_published,
-                    relation: 'IsPreviousVersionOf', // From perspective of current dataset
-                };
-
-                previousVersions.push(versionInfo);
-
-                // Recursively get even older versions
-                const olderVersions = await this.getPreviousVersions(previousDataset.id, visited);
-                previousVersions.push(...olderVersions);
-            }
+            // Recursively collect even older versions
+            result.push(...(await this.getPreviousVersions(related.id, visited)));
         }
 
-        return previousVersions;
+        return result;
     }
 
     /**
      * Recursively get all newer versions
      */
     private async getNewerVersions(datasetId: number, visited: Set = new Set()): Promise {
         // Prevent infinite loops
-        if (visited.has(datasetId)) {
-            return [];
-        }
+        if (visited.has(datasetId)) return [];
         visited.add(datasetId);
 
-        const newerVersions: any[] = [];
+        const result: any[] = [];
 
-        // Find references where this dataset "IsPreviousVersionOf" another dataset
-        const newerRefs = await DatasetReference.query()
+        // A dataset points to its NEWER version via relation 'IsPreviousVersionOf'.
+        // As above, related_document_id is not filtered on; the target is resolved via DOI.
+        const refs = await DatasetReference.query()
             .where('document_id', datasetId)
-            .where('relation', 'IsPreviousVersionOf')
-            .whereNotNull('related_document_id');
+            .where('relation', 'IsPreviousVersionOf');
 
-        for (const ref of newerRefs) {
-            if (!ref.related_document_id) continue;
+        for (const ref of refs) {
+            const related = await this.resolveReferencedDataset(ref, datasetId);
+            if (!related) continue;
 
-            const newerDataset = await Dataset.query().where('id', ref.related_document_id).preload('identifier').preload('titles').first();
+            result.push({
+                id: related.id,
+                publish_id: related.publish_id,
+                doi: related.identifier?.value || null,
+                main_title: related.mainTitle || null,
+                server_date_published: related.server_date_published,
+                relation: 'IsNewVersionOf',
+            });
 
-            if (newerDataset) {
-                const versionInfo = {
-                    id: newerDataset.id,
-                    publish_id: newerDataset.publish_id,
-                    doi: newerDataset.identifier?.value || null,
-                    main_title: newerDataset.mainTitle || null,
-                    server_date_published: newerDataset.server_date_published,
-                    relation: 'IsNewVersionOf', // From perspective of current dataset
-                };
-
-                newerVersions.push(versionInfo);
-
-                // Recursively get even newer versions
-                const evenNewerVersions = await this.getNewerVersions(newerDataset.id, visited);
-                newerVersions.push(...evenNewerVersions);
-            }
+            // Recursively collect even newer versions
+            result.push(...(await this.getNewerVersions(related.id, visited)));
         }
 
-        return newerVersions;
+        return result;
+    }
+
+    /**
+     * Resolve the dataset that a version reference points to.
+     *
+     * The DOI stored in `ref.value` is tried first; `related_document_id` is only
+     * used as a fallback. A dataset is never resolved to itself.
+     *
+     * @param ref - the version reference to resolve
+     * @param currentDatasetId - id of the dataset that owns `ref`
+     * @returns the referenced dataset with `identifier` and `titles` preloaded, or null
+     */
+    private async resolveReferencedDataset(ref: DatasetReference, currentDatasetId: number) {
+        const doi = this.normalizeDoi(ref.value);
+
+        if (doi) {
+            const byDoi = await Dataset.query()
+                .whereHas('identifier', (q) => q.where('value', doi))
+                .preload('identifier')
+                .preload('titles') // needed so mainTitle computes
+                .first();
+            // A DOI that resolves to the owning dataset is a self-link, not a version.
+            if (byDoi && byDoi.id !== currentDatasetId) return byDoi;
+        }
+
+        if (ref.related_document_id && ref.related_document_id !== currentDatasetId) {
+            return await Dataset.query()
+                .where('id', ref.related_document_id)
+                .preload('identifier')
+                .preload('titles')
+                .first();
+        }
+
+        return null;
+    }
+
+    /**
+     * Strip a resolver prefix (`https://doi.org/`, `http://dx.doi.org/`, `doi:`)
+     * from a reference value.
+     *
+     * @returns the bare DOI, or null when the value is empty or is not a DOI
+     */
+    private normalizeDoi(value: string | null): string | null {
+        if (!value) return null;
+        const cleaned = value
+            .trim()
+            .replace(/^https?:\/\/(dx\.)?doi\.org\//i, '')
+            .replace(/^doi:/i, '');
+        return /^10\.\d{4,}\//.test(cleaned) ? cleaned : null;
     }
 }
diff --git a/commands/fix_version_related_ids.ts b/commands/fix_version_related_ids.ts
new file mode 100644
index 0000000..40afc13
--- /dev/null
+++ b/commands/fix_version_related_ids.ts
@@ -0,0 +1,200 @@
+/*
+|--------------------------------------------------------------------------
+| node ace make:command fix-version-related-ids
+| DONE: create commands/fix_version_related_ids.ts
+|--------------------------------------------------------------------------
+| Repairs the `related_document_id` foreign key on version references
+| (IsNewVersionOf / IsPreviousVersionOf, both directions).
+|
+| The DOI stored in `value` is the reliable link; `related_document_id`
+| is frequently NULL or self-referential. This command resolves the target
+| dataset via its DOI and sets `related_document_id` accordingly, correcting
+| both NULL and wrong-but-non-null values.
+|
+| Examples:
+| node ace fix:version-related-ids // dry run, all datasets
+| node ace fix:version-related-ids --verbose // dry run with per-row detail
+| node ace fix:version-related-ids --fix // apply changes
+| node ace fix:version-related-ids --fix -p 226 // apply, only refs owned by publish_id 226
+*/
+import { BaseCommand, flags } from '@adonisjs/core/ace';
+import type { CommandOptions } from '@adonisjs/core/types/ace';
+import Dataset from '#models/dataset';
+import DatasetReference from '#models/dataset_reference';
+
+export default class FixVersionRelatedIds extends BaseCommand {
+    static commandName = 'fix:version-related-ids';
+    static description =
+        'Backfill/repair related_document_id on IsNewVersionOf / IsPreviousVersionOf references by resolving the target dataset via its DOI';
+
+    public static needsApplication = true;
+
+    @flags.boolean({ alias: 'f', description: 'Apply changes. Without this flag the command runs as a dry run.' })
+    public fix: boolean = false;
+
+    @flags.boolean({ alias: 'v', description: 'Verbose output (per-reference detail)' })
+    public verbose: boolean = false;
+
+    @flags.number({ alias: 'p', description: 'Only process references owned by this publish_id' })
+    public publish_id?: number;
+
+    public static options: CommandOptions = {
+        startApp: true,
+        staysAlive: false,
+    };
+
+    // Only the version relations, both directions.
+    private readonly VERSION_RELATIONS = ['IsNewVersionOf', 'IsPreviousVersionOf'];
+
+    /**
+     * Scan the version references, resolve each target dataset and report (or,
+     * with --fix, write) the corrected related_document_id. Sets exit code 1 on error.
+     */
+    async run() {
+        this.logger.info(`🔍 Scanning ${this.VERSION_RELATIONS.join(' / ')} references...`);
+        this.logger.info(this.fix ? '✏️ Mode: APPLY (changes will be written)' : '👀 Mode: DRY RUN (no changes written)');
+        if (typeof this.publish_id === 'number') {
+            this.logger.info(`🎯 Filtering by owning publish_id: ${this.publish_id}`);
+        }
+
+        try {
+            const query = DatasetReference.query()
+                .whereIn('relation', this.VERSION_RELATIONS)
+                .whereIn('type', ['DOI', 'URL'])
+                .where((q) => {
+                    q.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
+                });
+
+            // Restrict to references owned by a specific dataset (by publish_id), if requested.
+            if (typeof this.publish_id === 'number') {
+                query.whereHas('dataset', (d) => d.where('publish_id', this.publish_id as number));
+            }
+
+            const refs = await query.exec();
+            this.logger.info(`🔗 Found ${refs.length} version reference(s) to inspect`);
+
+            let alreadyCorrect = 0;
+            let filledFromNull = 0;
+            let correctedWrong = 0;
+            let unresolved = 0;
+
+            for (const ref of refs) {
+                const target = await this.resolveTarget(ref);
+
+                if (!target) {
+                    unresolved++;
+                    if (this.verbose) {
+                        this.logger.warning(`⚠️ Reference ${ref.id}: could not resolve target (value: ${ref.value})`);
+                    }
+                    continue;
+                }
+
+                // Never let a reference point at its own owning document.
+                if (target.id === ref.document_id) {
+                    unresolved++;
+                    if (this.verbose) {
+                        this.logger.warning(
+                            `⚠️ Reference ${ref.id}: target resolves to its own document (${ref.document_id}); skipping self-link`,
+                        );
+                    }
+                    continue;
+                }
+
+                if (ref.related_document_id === target.id) {
+                    alreadyCorrect++;
+                    continue;
+                }
+
+                const previous = ref.related_document_id;
+                const wasNull = previous == null;
+
+                if (this.fix) {
+                    ref.related_document_id = target.id;
+                    await ref.save();
+                }
+
+                if (wasNull) {
+                    filledFromNull++;
+                } else {
+                    correctedWrong++;
+                }
+
+                if (this.verbose) {
+                    const action = this.fix ? 'Updated' : '📝 Would update';
+                    this.logger.info(
+                        `${action} reference ${ref.id} (doc ${ref.document_id}, ${ref.relation}): ` +
+                            `related_document_id ${previous ?? 'NULL'} → ${target.id} (publish_id ${target.publish_id})`,
+                    );
+                }
+            }
+
+            this.logger.info('────────────────────────────────────────');
+            this.logger.info(`✔️ Already correct: ${alreadyCorrect}`);
+            this.logger.info(`➕ Filled from NULL: ${filledFromNull}`);
+            this.logger.info(`🔧 Corrected wrong value: ${correctedWrong}`);
+            this.logger.info(`⚠️ Unresolved/skipped: ${unresolved}`);
+            this.logger.info('────────────────────────────────────────');
+
+            const changes = filledFromNull + correctedWrong;
+            if (!this.fix && changes > 0) {
+                this.logger.info(`💡 Dry run only. Re-run with --fix to write ${changes} change(s).`);
+            } else if (this.fix) {
+                this.logger.success(`Done. ${changes} reference(s) updated.`);
+            } else {
+                this.logger.success('Nothing to change — all version references already linked correctly.');
+            }
+        } catch (error) {
+            // Signal failure through the command's exit code rather than process.exit(),
+            // which would end the process before the command returns normally.
+            const reason = error instanceof Error ? error.message : String(error);
+            this.logger.error(`Error fixing version related_document_id values: ${reason}`);
+            this.exitCode = 1;
+        }
+    }
+
+    /**
+     * Resolve the dataset a version reference points to.
+     * Prefers the DOI in `value` (reliable); falls back to a tethys publish_id URL.
+     */
+    private async resolveTarget(ref: DatasetReference): Promise<Dataset | null> {
+        const doi = this.normalizeDoi(ref.value);
+        if (doi) {
+            const byDoi = await Dataset.query()
+                .whereHas('identifier', (q) => q.where('value', doi))
+                .first();
+            if (byDoi) return byDoi;
+        }
+
+        const publishId = this.extractPublishId(ref.value);
+        if (publishId !== null) {
+            const byPublishId = await Dataset.query().where('publish_id', publishId).first();
+            if (byPublishId) return byPublishId;
+        }
+
+        return null;
+    }
+
+    /**
+     * Strip the resolver prefix so a reference value like
+     * "https://doi.org/10.24341/tethys.108.2" matches the identifier
+     * table value "10.24341/tethys.108.2". Returns null if it isn't a DOI.
+     */
+    private normalizeDoi(value: string | null): string | null {
+        if (!value) return null;
+        const cleaned = value
+            .trim()
+            .replace(/^https?:\/\/(dx\.)?doi\.org\//i, '')
+            .replace(/^doi:/i, '');
+        return /^10\.\d{4,}\//.test(cleaned) ? cleaned : null;
+    }
+
+    /**
+     * Extract the numeric publish_id from a value containing "tethys.at/dataset/<id>",
+     * or return null when the value is empty or contains no such URL.
+     */
+    private extractPublishId(value: string | null): number | null {
+        if (!value) return null;
+        const urlMatch = value.match(/tethys\.at\/dataset\/(\d+)/);
+        return urlMatch ? parseInt(urlMatch[1], 10) : null;
+    }
+}