tethys.backend/commands/update_datacite.ts
Arno Kaimbacher 6757bdb77c feat: Enhance ClamAV Docker entrypoint and configuration
- Updated docker-entrypoint.sh to improve ClamAV service initialization and logging.
- Added checks for ClamAV and freshclam daemon status.
- Optimized freshclam configuration for container usage, including logging to stdout and setting database directory.
- Introduced caching mechanism for enabled file extensions in vinejs_provider.ts to reduce database queries.
- Implemented a new command to list datasets needing DataCite DOI updates, with options for verbose output, count only, and IDs only.
- Updated package dependencies to include p-limit and pino-pretty.
- finalized ace command 'detect:missing-cross-references'
2025-09-26 12:19:35 +02:00

266 lines
11 KiB
TypeScript

/*
|--------------------------------------------------------------------------
| node ace make:command update-datacite
| DONE: create commands/update_datacite.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import { CommandOptions } from '@adonisjs/core/types/ace';
import Dataset from '#models/dataset';
import { DoiClient } from '#app/Library/Doi/DoiClient';
import DoiClientException from '#app/exceptions/DoiClientException';
import Index from '#app/Library/Utils/Index';
import env from '#start/env';
import logger from '@adonisjs/core/services/logger';
import { DateTime } from 'luxon';
import { getDomain } from '#app/utils/utility-functions';
/**
 * Ace command `update:datacite`.
 *
 * Loads published datasets that own a DOI identifier, decides per dataset
 * whether its DataCite record is older than the local modification date and,
 * if so, re-registers the DOI metadata through `DoiClient`.
 *
 * Modes (see the flags below):
 * - default: update every dataset that needs it
 * - `--force`: treat every selected dataset as needing an update
 * - `--dry-run`: only log what would be updated
 * - `--stats`: print a detail box per dataset that needs updating, change nothing
 *
 * The process exit code is set to 1 when required configuration is missing or
 * when at least one dataset failed to update.
 */
export default class UpdateDatacite extends BaseCommand {
    static commandName = 'update:datacite';
    static description = 'Update DataCite DOI records for published datasets';
    public static needsApplication = true;

    /**
     * Minimum lead (in seconds) the dataset modification date must have over
     * the DataCite modification date before an update is triggered. Prevents
     * unnecessary updates for minor timestamp differences.
     */
    private static readonly TOLERANCE_SECONDS = 60;

    @flags.number({ alias: 'p', description: 'Specific publish_id to update' })
    public publish_id?: number;

    @flags.boolean({ alias: 'f', description: 'Force update all records regardless of modification date' })
    public force: boolean = false;

    @flags.boolean({ alias: 'd', description: 'Dry run - show what would be updated without making changes' })
    public dryRun: boolean = false;

    @flags.boolean({ alias: 's', description: 'Show detailed stats for each dataset that needs updating' })
    public stats: boolean = false;

    //example: node ace update:datacite -p 123 --force --dry-run
    public static options: CommandOptions = {
        startApp: true, // Whether to boot the application before running the command
        stayAlive: false, // Whether to keep the process alive after the command has executed
    };

    /**
     * Command entry point: selects the datasets, processes them one by one and
     * logs a summary. A failure on one dataset is logged and counted but does
     * not stop the remaining datasets from being processed.
     */
    async run() {
        logger.info('Starting DataCite update process...');

        const prefix = env.get('DATACITE_PREFIX', '');
        const base_domain = env.get('BASE_DOMAIN', '');
        const apiUrl = env.get('DATACITE_API_URL', 'https://api.datacite.org');

        if (!prefix || !base_domain) {
            logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
            // Signal the misconfiguration to the caller instead of exiting with success.
            this.exitCode = 1;
            return;
        }

        logger.info(`Using DataCite API: ${apiUrl}`);

        const datasets = await this.getDatasets();
        logger.info(`Found ${datasets.length} datasets to process`);

        let updated = 0;
        let skipped = 0;
        let errors = 0;

        for (const dataset of datasets) {
            try {
                const shouldUpdate = this.force || (await this.shouldUpdateDataset(dataset));

                if (this.stats) {
                    // Stats mode: show detailed information for datasets that need updating
                    if (shouldUpdate) {
                        await this.showDatasetStats(dataset);
                        updated++;
                    } else {
                        skipped++;
                    }
                    continue;
                }

                if (!shouldUpdate) {
                    logger.info(`Dataset ${dataset.publish_id}: Up to date, skipping`);
                    skipped++;
                    continue;
                }

                if (this.dryRun) {
                    logger.info(`Dataset ${dataset.publish_id}: Would update DataCite record (dry run)`);
                    updated++;
                    continue;
                }

                await this.updateDataciteRecord(dataset, prefix, base_domain);
                logger.info(`Dataset ${dataset.publish_id}: Successfully updated DataCite record`);
                updated++;
            } catch (error) {
                logger.error(`Dataset ${dataset.publish_id}: Failed to update - ${UpdateDatacite.describeError(error)}`);
                errors++;
            }
        }

        if (this.stats) {
            logger.info(`\nDataCite Stats Summary: ${updated} datasets need updating, ${skipped} are up to date`);
        } else {
            logger.info(`DataCite update completed. Updated: ${updated}, Skipped: ${skipped}, Errors: ${errors}`);
        }

        if (errors > 0) {
            // Let schedulers/CI notice that at least one dataset could not be updated.
            this.exitCode = 1;
        }
    }

    /**
     * Extracts a printable message from a caught value, which is not
     * guaranteed to be an `Error` instance.
     */
    private static describeError(error: unknown): string {
        return error instanceof Error ? error.message : String(error);
    }

    /**
     * Returns the dataset's `identifier` relation, loading it first when it is
     * not already present on the model instance.
     */
    private async loadIdentifier(dataset: Dataset): Promise<Dataset['identifier']> {
        if (!dataset.identifier) {
            await dataset.load('identifier');
        }
        return dataset.identifier;
    }

    /**
     * Fetches the published datasets that have an identifier of type `doi`,
     * with the `identifier` and `xmlCache` relations preloaded. When the
     * `publish_id` flag is set, the result is restricted to that dataset.
     */
    private async getDatasets(): Promise<Dataset[]> {
        const query = Dataset.query()
            .preload('identifier')
            .preload('xmlCache')
            .where('server_state', 'published')
            .whereHas('identifier', (identifierQuery) => {
                identifierQuery.where('type', 'doi');
            });

        if (this.publish_id) {
            query.where('publish_id', this.publish_id);
        }

        return await query.exec();
    }

    /**
     * Decides whether the DataCite record of a dataset is out of date.
     *
     * @returns `true` when the dataset has no modification date, or when its
     * modification date is more than `TOLERANCE_SECONDS` newer than the date
     * reported by DataCite. `false` when there is no DOI identifier, the
     * modification date lies in the future, DataCite returned no date, or any
     * error occurred while checking (the error is logged as a warning).
     */
    private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
        try {
            const doiIdentifier = await this.loadIdentifier(dataset);
            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
                return false;
            }

            const datasetModified = dataset.server_date_modified;
            if (!datasetModified) {
                return true; // Update if modification date is missing
            }
            if (datasetModified > DateTime.now()) {
                return false; // Skip invalid future dates
            }

            // Check DataCite DOI modification date
            const doiClient = new DoiClient();
            const doiLastModified = await doiClient.getDoiLastModified(doiIdentifier.value);
            if (!doiLastModified) {
                return false; // Do not update if we can't get DOI info
            }

            // A signed difference covers both cases of the original check: it is
            // <= 0 when the dataset is not newer than the DOI, and only exceeds the
            // tolerance when the dataset is meaningfully newer.
            const doiModified = DateTime.fromJSDate(doiLastModified);
            const leadInSeconds = datasetModified.diff(doiModified, 'seconds').seconds;
            return leadInSeconds > UpdateDatacite.TOLERANCE_SECONDS;
        } catch (error) {
            // Stay conservative (no update) when the status cannot be determined,
            // but leave a trace so that e.g. a DataCite outage is not mistaken for
            // "everything is up to date".
            logger.warn(
                `Dataset ${dataset.publish_id}: Could not determine DataCite update status - ${UpdateDatacite.describeError(error)}`,
            );
            return false;
        }
    }

    /**
     * Regenerates the DOI registration XML for a dataset and registers it with
     * DataCite under the dataset's existing DOI value.
     *
     * @param dataset - dataset whose DataCite record is to be updated
     * @param prefix - DataCite prefix; currently unused because the existing DOI value is reused
     * @param base_domain - base domain used to build the landing page URL
     * @throws DoiClientException when DataCite answers with a status other than 201
     * (or when the client itself raises one)
     * @throws Error for every other failure, with the original message wrapped
     */
    private async updateDataciteRecord(dataset: Dataset, prefix: string, base_domain: string): Promise<void> {
        try {
            // Get the DOI identifier (HasOne relationship)
            const doiIdentifier = await this.loadIdentifier(dataset);
            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
                throw new Error('No DOI identifier found for dataset');
            }

            // Generate XML metadata
            const xmlMeta = (await Index.getDoiRegisterString(dataset)) as string;
            if (!xmlMeta) {
                throw new Error('Failed to generate XML metadata');
            }

            // Construct DOI value and landing page URL
            const doiValue = doiIdentifier.value; // Use existing DOI value
            const landingPageUrl = `https://doi.${getDomain(base_domain)}/${doiValue}`;

            // Update DataCite record
            const doiClient = new DoiClient();
            const dataciteResponse = await doiClient.registerDoi(doiValue, xmlMeta, landingPageUrl);

            if (dataciteResponse?.status !== 201) {
                throw new DoiClientException(
                    dataciteResponse?.status || 500,
                    `Unexpected DataCite response code: ${dataciteResponse?.status}`,
                );
            }

            // Intentionally disabled follow-up steps, kept for reference:
            // // Update dataset modification date
            // dataset.server_date_modified = DateTime.now();
            // await dataset.save();
            // // Update search index
            // const index_name = 'tethys-records';
            // await Index.indexDocument(dataset, index_name);

            // Only the DataCite record is touched here; the search index is not updated.
            logger.debug(`Dataset ${dataset.publish_id}: DataCite record updated successfully`);
        } catch (error) {
            if (error instanceof DoiClientException) {
                throw error;
            }
            throw new Error(`Failed to update DataCite record: ${UpdateDatacite.describeError(error)}`);
        }
    }

    /**
     * Shows detailed statistics for a dataset that needs updating
     *
     * Prints a box with the local DOI data and, when the dataset actually has
     * a DOI value, the modification date and state reported by DataCite. Any
     * error is caught and printed as an error box instead of being rethrown.
     */
    private async showDatasetStats(dataset: Dataset): Promise<void> {
        try {
            const doiIdentifier = await this.loadIdentifier(dataset);
            const doiValue = doiIdentifier?.value || 'N/A';
            const doiStatus = doiIdentifier?.status || 'N/A';
            const datasetModified = dataset.server_date_modified;

            // Get DOI info from DataCite - only when there is a real DOI to ask for;
            // querying the 'N/A' placeholder would just produce a bogus remote lookup.
            const doiClient = doiIdentifier?.value ? new DoiClient() : null;
            const [doiLastModified, doiState] = doiClient
                ? await Promise.all([doiClient.getDoiLastModified(doiValue), doiClient.getDoiState(doiValue)])
                : ([null, null] as const);

            // Name the actual reason this dataset was selected for updating.
            let needsUpdate = 'YES - Dataset newer than DOI';
            if (this.force) {
                needsUpdate = 'YES - Forced (--force)';
            } else if (!datasetModified) {
                needsUpdate = 'YES - Dataset modification date missing';
            }

            console.log(`
┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────
│ DOI Value: ${doiValue}
│ DOI Status (DB): ${doiStatus}
│ DOI State (DataCite): ${doiState || 'Unknown'}
│ Dataset Modified: ${datasetModified ? datasetModified.toISO() : 'N/A'}
│ DOI Modified: ${doiLastModified ? DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}
│ Needs Update: ${needsUpdate}
└─────────────────────────────────────────────────────────────────────────────────────────────`);
        } catch (error) {
            console.log(`
┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────
│ DOI Value: ${dataset.identifier?.value || 'N/A'}
│ Error: ${UpdateDatacite.describeError(error)}
│ Needs Update: YES - Error checking status
└─────────────────────────────────────────────────────────────────────────────────────────────`);
        }
    }
}