- Updated docker-entrypoint.sh to improve ClamAV service initialization and logging.
- Added checks for ClamAV and freshclam daemon status.
- Optimized the freshclam configuration for container usage, including logging to stdout and setting the database directory.
- Introduced a caching mechanism for enabled file extensions in vinejs_provider.ts to reduce database queries.
- Implemented a new command to list datasets needing DataCite DOI updates, with options for verbose output, count only, and IDs only.
- Updated package dependencies to include p-limit and pino-pretty.
- Finalized the ace command 'detect:missing-cross-references'.
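For context on the caching item, here is a minimal sketch of what an in-memory TTL cache for enabled file extensions might look like, assuming a Lucid `MimeType` model with `enabled` and `file_extension` columns; the names and shape are illustrative, not the actual vinejs_provider.ts implementation:

```ts
// Illustrative sketch only — the real vinejs_provider.ts may differ.
import MimeType from '#models/mime_type'; // assumed model name

let cachedExtensions: string[] | null = null;
let cacheExpiresAt = 0;
const CACHE_TTL_MS = 5 * 60 * 1000; // refresh at most every five minutes

export async function getEnabledExtensions(): Promise<string[]> {
    if (cachedExtensions && Date.now() < cacheExpiresAt) {
        return cachedExtensions; // served from memory, no database query
    }
    const rows = await MimeType.query().where('enabled', true);
    cachedExtensions = rows.map((row) => row.file_extension);
    cacheExpiresAt = Date.now() + CACHE_TTL_MS;
    return cachedExtensions;
}
```

A short TTL keeps validation fast while still letting newly enabled extensions show up within minutes.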
/*
|--------------------------------------------------------------------------
| node ace make:command list-updateable-datacite
|
| DONE: create commands/list_updateable_datacite.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import { CommandOptions } from '@adonisjs/core/types/ace';
import Dataset from '#models/dataset';
import { DoiClient } from '#app/Library/Doi/DoiClient';
import env from '#start/env';
import logger from '@adonisjs/core/services/logger';
import { DateTime } from 'luxon';
import pLimit from 'p-limit';
export default class ListUpdateableDatacite extends BaseCommand {
    static commandName = 'list:updateable-datacite';
    static description = 'List all datasets that need DataCite DOI updates';

    public static needsApplication = true;

    // private chunkSize = 100; // Set chunk size for pagination
    @flags.boolean({ alias: 'v', description: 'Verbose output showing detailed information' })
    public verbose: boolean = false;

    @flags.boolean({ alias: 'c', description: 'Show only count of updatable datasets' })
    public countOnly: boolean = false;

    @flags.boolean({ alias: 'i', description: 'Show only publish IDs (useful for scripting)' })
    public idsOnly: boolean = false;

    @flags.number({ description: 'Chunk size for processing datasets (default: 50)' })
    public chunkSize: number = 50;
    //example: node ace list:updateable-datacite
    //example: node ace list:updateable-datacite --verbose
    //example: node ace list:updateable-datacite --count-only
    //example: node ace list:updateable-datacite --ids-only
    //example: node ace list:updateable-datacite --chunk-size 50
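    // The --ids-only output is meant for scripting. A hypothetical shell loop
    // (assumes update:datacite accepts -p <publish_id>, as printed by showSimpleOutput below):
    //   for id in $(node ace list:updateable-datacite --ids-only); do
    //     node ace update:datacite -p "$id"
    //   done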
    public static options: CommandOptions = {
        startApp: true,
        stayAlive: false,
    };
    async run() {
        const prefix = env.get('DATACITE_PREFIX', '');
        const base_domain = env.get('BASE_DOMAIN', '');

        if (!prefix || !base_domain) {
            logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
            return;
        }
        // Prevent conflicting flags
        if ((this.verbose && this.countOnly) || (this.verbose && this.idsOnly)) {
            logger.error('--verbose cannot be combined with --count-only or --ids-only');
            return;
        }
        const chunkSize = this.chunkSize || 50;
        let page = 1;
        let hasMoreDatasets = true;
        let totalProcessed = 0;
        const updatableDatasets: Dataset[] = [];

        if (!this.countOnly && !this.idsOnly) {
            logger.info(`Processing datasets in chunks of ${chunkSize}...`);
        }

        while (hasMoreDatasets) {
            const datasets = await this.getDatasets(page, chunkSize);

            if (datasets.length === 0) {
                hasMoreDatasets = false;
                break;
            }

            if (!this.countOnly && !this.idsOnly) {
                logger.info(`Processing chunk ${page} (${datasets.length} datasets)...`);
            }

            const chunkUpdatableDatasets = await this.processChunk(datasets);
            updatableDatasets.push(...chunkUpdatableDatasets);
            totalProcessed += datasets.length;

            page += 1;
            if (datasets.length < chunkSize) {
                hasMoreDatasets = false;
            }
        }

        if (!this.countOnly && !this.idsOnly) {
            logger.info(`Processed ${totalProcessed} datasets total, found ${updatableDatasets.length} that need updates`);
        }

        if (this.countOnly) {
            console.log(updatableDatasets.length);
        } else if (this.idsOnly) {
            updatableDatasets.forEach((dataset) => console.log(dataset.publish_id));
        } else if (this.verbose) {
            await this.showVerboseOutput(updatableDatasets);
        } else {
            this.showSimpleOutput(updatableDatasets);
        }
    }
    // Previous implementation, kept for reference. It processed the whole chunk in
    // parallel with Promise.allSettled but without a concurrency cap:
    //
    // private async processChunk(datasets: Dataset[]): Promise<Dataset[]> {
    //     // Why Promise.allSettled vs Promise.all?
    //     // - Promise.all fails fast: if ANY promise rejects, the entire operation fails
    //     // - Promise.allSettled waits for ALL promises: some can fail, others succeed
    //     // - This is crucial for batch processing where we don't want one bad dataset
    //     //   to stop processing of the entire chunk
    //     const results = await Promise.allSettled(
    //         datasets.map(async (dataset) => {
    //             try {
    //                 // Check if this specific dataset needs a DataCite update;
    //                 // return the dataset if so, null if not (nulls are filtered below)
    //                 const needsUpdate = await this.shouldUpdateDataset(dataset);
    //                 return needsUpdate ? dataset : null;
    //             } catch (error) {
    //                 // Log warnings only outside the silent modes (count-only, ids-only)
    //                 // to avoid log spam in automated scripts
    //                 if (!this.countOnly && !this.idsOnly) {
    //                     logger.warn(`Error checking dataset ${dataset.publish_id}: ${error.message}`);
    //                 }
    //
    //                 // Fail-safe: if we can't determine the status, include the dataset.
    //                 // Missing a needed update is worse than one redundant update.
    //                 return dataset;
    //             }
    //         }),
    //     );
    //
    //     // Promise.allSettled yields { status: 'fulfilled', value: T } for successes and
    //     // { status: 'rejected', reason: Error } for failures. Keep only fulfilled results
    //     // with non-null values and unwrap the Dataset objects.
    //     return results
    //         .filter(
    //             (result): result is PromiseFulfilledResult<Dataset | null> =>
    //                 result.status === 'fulfilled' && result.value !== null,
    //         )
    //         .map((result) => result.value!); // safe: nulls were filtered out above
    // }

    /**
     * Processes a chunk of datasets to determine which ones need DataCite updates.
     *
     * Handles parallel processing of datasets within a chunk, with per-dataset
     * error handling and filtering of results.
     *
     * @param datasets - Array of Dataset objects to process
     * @returns Promise<Dataset[]> - Array of datasets that need updates
     */
    private async processChunk(datasets: Dataset[]): Promise<Dataset[]> {
        // Cap concurrency so we don't flood the DataCite API (max 5 checks at once)
        const limit = pLimit(5);

        const tasks = datasets.map((dataset) =>
            limit(async () => {
                try {
                    const needsUpdate = await this.shouldUpdateDataset(dataset);
                    return needsUpdate ? dataset : null;
                } catch (error) {
                    if (!this.countOnly && !this.idsOnly) {
                        logger.warn(
                            `Error checking dataset ${dataset.publish_id}: ${
                                error instanceof Error ? error.message : JSON.stringify(error)
                            }`,
                        );
                    }
                    // Fail-safe: include the dataset if its status is uncertain
                    return dataset;
                }
            }),
        );

        // The try/catch above means tasks should never reject; Promise.allSettled is
        // kept as a safety net so one unexpected rejection cannot sink the whole chunk.
        const results = await Promise.allSettled(tasks);

        return results
            .filter((result): result is PromiseFulfilledResult<Dataset | null> => result.status === 'fulfilled' && result.value !== null)
            .map((result) => result.value!);
    }
    private async getDatasets(page: number, chunkSize: number): Promise<Dataset[]> {
        return await Dataset.query()
            .orderBy('publish_id', 'asc')
            .preload('identifier')
            .preload('xmlCache')
            .preload('titles')
            .where('server_state', 'published')
            .whereHas('identifier', (identifierQuery) => {
                identifierQuery.where('type', 'doi');
            })
            .forPage(page, chunkSize); // Get the datasets for the current page
    }
    private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
        try {
            let doiIdentifier = dataset.identifier;
            if (!doiIdentifier) {
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
                return false;
            }

            const datasetModified =
                dataset.server_date_modified instanceof DateTime
                    ? dataset.server_date_modified
                    : DateTime.fromJSDate(dataset.server_date_modified);

            // No usable modification date: err on the side of updating.
            // (DateTime.fromJSDate returns a truthy but invalid DateTime for bad input,
            // so the isValid check is required as well.)
            if (!datasetModified || !datasetModified.isValid) {
                return true;
            }

            if (datasetModified > DateTime.now()) {
                return false;
            }

            const doiClient = new DoiClient();
            const DOI_CHECK_TIMEOUT = 300; // ms

            const doiLastModified = await Promise.race([
                doiClient.getDoiLastModified(doiIdentifier.value),
                this.createTimeoutPromise(DOI_CHECK_TIMEOUT),
            ]).catch(() => null);

            if (!doiLastModified) {
                // If uncertain, err on the side of including the dataset for update
                return true;
            }

            const doiModified = DateTime.fromJSDate(doiLastModified);
            if (datasetModified > doiModified) {
                // Only flag an update when the dataset is newer by more than the tolerance
                const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds);
                const toleranceSeconds = 600;
                return diffInSeconds > toleranceSeconds;
            }
            return false;
        } catch (error) {
            return true; // safer: include the dataset if unsure
        }
    }
    /**
     * Create a timeout promise for API calls
     */
    private createTimeoutPromise(timeoutMs: number): Promise<never> {
        return new Promise((_, reject) => {
            setTimeout(() => reject(new Error(`API call timeout after ${timeoutMs}ms`)), timeoutMs);
        });
    }
    private showSimpleOutput(updatableDatasets: Dataset[]): void {
        if (updatableDatasets.length === 0) {
            console.log('No datasets need DataCite updates.');
            return;
        }

        console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);

        updatableDatasets.forEach((dataset) => {
            console.log(`publish_id ${dataset.publish_id} needs update - ${dataset.mainTitle || 'Untitled'}`);
        });

        console.log(`\nTo update these datasets, run:`);
        console.log(`  node ace update:datacite`);
        console.log(`\nOr update specific datasets:`);
        console.log(`  node ace update:datacite -p <publish_id>`);
    }
    private async showVerboseOutput(updatableDatasets: Dataset[]): Promise<void> {
        if (updatableDatasets.length === 0) {
            console.log('No datasets need DataCite updates.');
            return;
        }

        console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);

        for (const dataset of updatableDatasets) {
            await this.showDatasetDetails(dataset);
        }

        console.log(`\nSummary: ${updatableDatasets.length} datasets need updates`);
    }
    private async showDatasetDetails(dataset: Dataset): Promise<void> {
        try {
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            const doiValue = doiIdentifier?.value || 'N/A';
            const datasetModified = dataset.server_date_modified;

            // Get DOI info from DataCite
            const doiClient = new DoiClient();
            const doiLastModified = await doiClient.getDoiLastModified(doiValue);
            const doiState = await doiClient.getDoiState(doiValue);

            console.log(`┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`);
            console.log(`│ Title: ${dataset.mainTitle || 'Untitled'}`);
            console.log(`│ DOI: ${doiValue}`);
            console.log(`│ DOI State: ${doiState || 'Unknown'}`);
            console.log(`│ Dataset Modified: ${datasetModified ? datasetModified.toISO() : 'N/A'}`);
            console.log(`│ DOI Modified: ${doiLastModified ? DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}`);
            console.log(`│ Status: NEEDS UPDATE`);
            console.log(`└─────────────────────────────────────────────────────────────────────────────────────────────\n`);
        } catch (error) {
            console.log(`┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`);
            console.log(`│ Title: ${dataset.mainTitle || 'Untitled'}`);
            console.log(`│ DOI: ${dataset.identifier?.value || 'N/A'}`);
            console.log(`│ Error: ${error instanceof Error ? error.message : String(error)}`);
            console.log(`│ Status: NEEDS UPDATE (Error checking)`);
            console.log(`└─────────────────────────────────────────────────────────────────────────────────────────────\n`);
        }
    }
}