- fix: Update API routes to include DOI URL handling and improve route organization - chore: Add ORCID preload rule file and ensure proper registration - docs: Add MIT License to the project for open-source compliance - feat: Implement command to detect and fix missing dataset cross-references - feat: Create command for updating DataCite DOI records with detailed logging and error handling - docs: Add comprehensive documentation for dataset indexing command - docs: Create detailed documentation for DataCite update command with usage examples and error handling
271 lines
12 KiB
TypeScript
271 lines
12 KiB
TypeScript
/*
|
|
|--------------------------------------------------------------------------
|
|
| node ace make:command update-datacite
|
|
| DONE: create commands/update_datacite.ts
|
|
|--------------------------------------------------------------------------
|
|
*/
|
|
import { BaseCommand, flags } from '@adonisjs/core/ace';
|
|
import { CommandOptions } from '@adonisjs/core/types/ace';
|
|
import Dataset from '#models/dataset';
|
|
import { DoiClient } from '#app/Library/Doi/DoiClient';
|
|
import DoiClientException from '#app/exceptions/DoiClientException';
|
|
import Index from '#app/Library/Utils/Index';
|
|
import env from '#start/env';
|
|
import logger from '@adonisjs/core/services/logger';
|
|
import { DateTime } from 'luxon';
|
|
import { getDomain } from '#app/utils/utility-functions';
|
|
|
|
/**
 * Ace command `update:datacite`.
 *
 * Loads published datasets that have a DOI identifier and re-registers their
 * metadata with DataCite when the dataset was modified after the DOI record
 * (or unconditionally with `--force`).
 *
 * Modes:
 * - default:   update every dataset that needs it
 * - --dry-run: only log what would be updated
 * - --stats:   print a detail box per dataset that needs updating, no changes made
 *
 * The process exit code is set to 1 when the configuration is incomplete, the
 * `publish_id` flag is not an integer, or at least one dataset failed to update,
 * so that schedulers and CI jobs can detect the failure.
 */
export default class UpdateDatacite extends BaseCommand {
    static commandName = 'update:datacite';
    static description = 'Update DataCite DOI records for published datasets';

    public static needsApplication = true;

    // Optional at runtime: stays undefined unless the flag is passed on the command line.
    @flags.number({ alias: 'p', description: 'Specific publish_id to update' })
    public publish_id?: number;

    @flags.boolean({ alias: 'f', description: 'Force update all records regardless of modification date' })
    public force: boolean = false;

    @flags.boolean({ alias: 'd', description: 'Dry run - show what would be updated without making changes' })
    public dryRun: boolean = false;

    @flags.boolean({ alias: 's', description: 'Show detailed stats for each dataset that needs updating' })
    public stats: boolean = false;

    // Example: node ace update:datacite -p 123 --force --dry-run

    public static options: CommandOptions = {
        startApp: true, // Whether to boot the application before running the command
        stayAlive: false, // Whether to keep the process alive after the command has executed
    };

    /**
     * Extracts a printable message from a caught value.
     * Caught values are not guaranteed to be Error instances, so narrow before reading `.message`.
     */
    private static errorMessage(error: unknown): string {
        return error instanceof Error ? error.message : String(error);
    }

    /**
     * Command entry point: validates configuration, selects the datasets and
     * processes them one by one, then logs a summary.
     *
     * A failure on one dataset is logged and counted but does not stop the loop.
     */
    async run() {
        logger.info('Starting DataCite update process...');

        const prefix = env.get('DATACITE_PREFIX', '');
        const base_domain = env.get('BASE_DOMAIN', '');
        const apiUrl = env.get('DATACITE_API_URL', 'https://api.datacite.org');

        if (!prefix || !base_domain) {
            logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
            this.exitCode = 1; // a misconfigured run must not look successful to the caller
            return;
        }

        // Reject an unusable filter value instead of silently dropping the filter,
        // which would make the command touch every published dataset.
        if (this.publish_id !== undefined && !Number.isInteger(this.publish_id)) {
            logger.error(`Invalid publish_id "${this.publish_id}": expected an integer`);
            this.exitCode = 1;
            return;
        }

        logger.info(`Using DataCite API: ${apiUrl}`);

        const datasets = await this.getDatasets();
        logger.info(`Found ${datasets.length} datasets to process`);

        let updated = 0;
        let skipped = 0;
        let errors = 0;

        for (const dataset of datasets) {
            try {
                // With --force the remote modification check is skipped entirely.
                const shouldUpdate = this.force || (await this.shouldUpdateDataset(dataset));

                if (this.stats) {
                    // Stats mode: show detailed information for datasets that need updating,
                    // never write anything to DataCite.
                    if (shouldUpdate) {
                        await this.showDatasetStats(dataset);
                        updated++;
                    } else {
                        skipped++;
                    }
                    continue;
                }

                if (!shouldUpdate) {
                    logger.info(`Dataset ${dataset.publish_id}: Up to date, skipping`);
                    skipped++;
                    continue;
                }

                if (this.dryRun) {
                    logger.info(`Dataset ${dataset.publish_id}: Would update DataCite record (dry run)`);
                    updated++;
                    continue;
                }

                await this.updateDataciteRecord(dataset, prefix, base_domain);
                logger.info(`Dataset ${dataset.publish_id}: Successfully updated DataCite record`);
                updated++;
            } catch (error) {
                logger.error(`Dataset ${dataset.publish_id}: Failed to update - ${UpdateDatacite.errorMessage(error)}`);
                errors++;
            }
        }

        if (this.stats) {
            logger.info(`\nDataCite Stats Summary: ${updated} datasets need updating, ${skipped} are up to date`);
        } else {
            logger.info(`DataCite update completed. Updated: ${updated}, Skipped: ${skipped}, Errors: ${errors}`);
        }

        if (errors > 0) {
            // Surface partial failure through the exit code; the details are in the log above.
            this.exitCode = 1;
        }
    }

    /**
     * Loads published datasets that have an identifier of type `doi`,
     * with the `identifier` and `xmlCache` relations preloaded.
     *
     * @returns all matching datasets, or only the one matching `publish_id` when that flag was given
     */
    private async getDatasets(): Promise<Dataset[]> {
        const query = Dataset.query()
            .preload('identifier')
            .preload('xmlCache')
            .where('server_state', 'published')
            .whereHas('identifier', (identifierQuery) => {
                identifierQuery.where('type', 'doi');
            });

        // Explicit undefined check: a truthiness test would ignore `-p 0` and select everything.
        if (this.publish_id !== undefined) {
            query.where('publish_id', this.publish_id);
        }

        return await query.exec();
    }

    /**
     * Decides whether the DataCite record of a dataset is stale.
     *
     * @returns
     * - `false` when the dataset has no DOI identifier or its modification date lies in the future
     * - `true` when the dataset or DOI modification date is unavailable, or the check itself throws
     * - otherwise whether the dataset was modified after the DOI record
     */
    private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
        try {
            // Check if dataset has a DOI identifier (HasOne relationship)
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
                // Try to load the relationship if not already loaded
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
                logger.warn(`Dataset ${dataset.publish_id}: No DOI identifier found`);
                return false;
            }

            // Validate dataset modification date
            const datasetModified = dataset.server_date_modified;
            const now = DateTime.now();

            if (!datasetModified) {
                logger.error(`Dataset ${dataset.publish_id}: server_date_modified is null or undefined`);
                return true; // Update anyway if modification date is missing
            }

            if (datasetModified > now) {
                logger.error(
                    `Dataset ${dataset.publish_id}: server_date_modified (${datasetModified.toISO()}) is in the future! ` +
                        `Current time: ${now.toISO()}. This indicates a data integrity issue. Skipping update.`,
                );
                return false; // Do not update when modification date is invalid
            }

            // Get DOI information from DataCite using DoiClient
            const doiClient = new DoiClient();
            const doiLastModified = await doiClient.getDoiLastModified(doiIdentifier.value);

            if (!doiLastModified) {
                logger.warn(`Dataset ${dataset.publish_id}: Could not retrieve DOI modification date from DataCite`);
                return true; // Update anyway if we can't get DOI info
            }

            // Compare dataset modification date with DOI modification date
            const doiModified = DateTime.fromJSDate(doiLastModified);

            logger.debug(
                `Dataset ${dataset.publish_id}: Dataset modified: ${datasetModified.toISO()}, DOI modified: ${doiModified.toISO()}`,
            );

            // Update if dataset was modified after the DOI record
            return datasetModified > doiModified;
        } catch (error) {
            logger.warn(`Error checking update status for dataset ${dataset.publish_id}: ${UpdateDatacite.errorMessage(error)}`);
            return true; // Update anyway if we can't determine status
        }
    }

    /**
     * Regenerates the DOI metadata XML for a dataset and registers it with DataCite
     * under the dataset's existing DOI value.
     *
     * @param dataset     dataset whose DataCite record is refreshed
     * @param prefix      DataCite prefix; currently not used because the stored DOI value is reused
     * @param base_domain base domain from which the `https://doi.<domain>/<doi>` landing page URL is built
     * @throws DoiClientException when DataCite answers with a status other than 201 (rethrown unchanged)
     * @throws Error for every other failure, wrapped with a "Failed to update DataCite record" prefix
     */
    private async updateDataciteRecord(dataset: Dataset, prefix: string, base_domain: string): Promise<void> {
        try {
            // Get the DOI identifier (HasOne relationship)
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
                throw new Error('No DOI identifier found for dataset');
            }

            // Generate XML metadata
            const xmlMeta = (await Index.getDoiRegisterString(dataset)) as string;
            if (!xmlMeta) {
                throw new Error('Failed to generate XML metadata');
            }

            // Construct DOI value and landing page URL
            const doiValue = doiIdentifier.value; // Use existing DOI value
            const landingPageUrl = `https://doi.${getDomain(base_domain)}/${doiValue}`;

            // Update DataCite record
            const doiClient = new DoiClient();
            const dataciteResponse = await doiClient.registerDoi(doiValue, xmlMeta, landingPageUrl);

            // Only 201 is accepted as success; any other status is reported as a client exception.
            if (dataciteResponse?.status !== 201) {
                throw new DoiClientException(
                    dataciteResponse?.status ?? 500,
                    `Unexpected DataCite response code: ${dataciteResponse?.status}`,
                );
            }

            // Intentionally disabled: touching the dataset or the search index here.
            // // Update dataset modification date
            // dataset.server_date_modified = DateTime.now();
            // await dataset.save();

            // // Update search index
            // const index_name = 'tethys-records';
            // await Index.indexDocument(dataset, index_name);

            // The search index is not touched (see above), so the message only mentions DataCite.
            logger.debug(`Dataset ${dataset.publish_id}: DataCite record updated successfully`);
        } catch (error) {
            if (error instanceof DoiClientException) {
                throw error;
            }
            throw new Error(`Failed to update DataCite record: ${UpdateDatacite.errorMessage(error)}`);
        }
    }

    /**
     * Shows detailed statistics for a dataset that needs updating.
     *
     * Prints a box with the stored DOI value/status, the DOI state and modification
     * date reported by DataCite, the dataset modification date and the reason the
     * dataset is considered out of date. Never throws: on failure an error box is printed.
     */
    private async showDatasetStats(dataset: Dataset): Promise<void> {
        try {
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            const doiValue = doiIdentifier?.value || 'N/A';
            const doiStatus = doiIdentifier?.status || 'N/A';
            const datasetModified = dataset.server_date_modified;

            // Get DOI info from DataCite, but only for a real DOI value:
            // the 'N/A' placeholder must never be sent to the remote API.
            const hasDoi = Boolean(doiIdentifier?.value);
            const doiClient = new DoiClient();
            const doiLastModified = hasDoi ? await doiClient.getDoiLastModified(doiValue) : null;
            const doiState = hasDoi ? await doiClient.getDoiState(doiValue) : null;

            // Derive the reason from the data instead of always claiming the dataset is newer.
            let reason = this.force ? 'Forced update (--force)' : 'Update status could not be determined';
            if (!datasetModified || !doiLastModified) {
                reason = 'Modification date unavailable';
            } else if (datasetModified > DateTime.fromJSDate(doiLastModified)) {
                reason = 'Dataset newer than DOI';
            }

            console.log(`
┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────
│ DOI Value: ${doiValue}
│ DOI Status (DB): ${doiStatus}
│ DOI State (DataCite): ${doiState || 'Unknown'}
│ Dataset Modified: ${datasetModified ? datasetModified.toISO() : 'N/A'}
│ DOI Modified: ${doiLastModified ? DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}
│ Needs Update: YES - ${reason}
└─────────────────────────────────────────────────────────────────────────────────────────────`);
        } catch (error) {
            console.log(`
┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────
│ DOI Value: ${dataset.identifier?.value || 'N/A'}
│ Error: ${UpdateDatacite.errorMessage(error)}
│ Needs Update: YES - Error checking status
└─────────────────────────────────────────────────────────────────────────────────────────────`);
        }
    }
}
|