/*
|--------------------------------------------------------------------------
| node ace make:command list-updateable-datacite
| DONE: create commands/list_updeatable_datacite.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import type { CommandOptions } from '@adonisjs/core/types/ace';
import Dataset from '#models/dataset';
import { DoiClient } from '#app/Library/Doi/DoiClient';
import env from '#start/env';
import logger from '@adonisjs/core/services/logger';
import { DateTime } from 'luxon';
import pLimit from 'p-limit';

/** Page size used when `--chunk-size` is missing or not a positive integer. */
const DEFAULT_CHUNK_SIZE = 50;

/** Maximum number of datasets checked against DataCite at the same time. */
const MAX_CONCURRENT_DOI_CHECKS = 5;

/**
 * How long a single "last modified" lookup may take before it is abandoned.
 * NOTE(review): 300 ms is tight for a remote HTTP call, and every timed-out
 * lookup is reported as "needs update" — confirm this value is intentional.
 */
const DOI_CHECK_TIMEOUT_MS = 300;

/** A dataset must be newer than its DOI record by more than this to count as outdated. */
const UPDATE_TOLERANCE_SECONDS = 600;

/**
 * Ace command that lists every published dataset whose DOI record at DataCite
 * is older than the dataset itself (beyond a small tolerance).
 *
 * Datasets are read page by page and checked with limited concurrency.
 * Whenever the status of a dataset cannot be determined it is reported as
 * needing an update, so that no outdated record is missed.
 */
export default class ListUpdateableDatacite extends BaseCommand {
    static commandName = 'list:updateable-datacite';
    static description = 'List all datasets that need DataCite DOI updates';

    public static needsApplication = true;

    @flags.boolean({ alias: 'v', description: 'Verbose output showing detailed information' })
    public verbose: boolean = false;

    @flags.boolean({ alias: 'c', description: 'Show only count of updatable datasets' })
    public countOnly: boolean = false;

    @flags.boolean({ alias: 'i', description: 'Show only publish IDs (useful for scripting)' })
    public idsOnly: boolean = false;

    @flags.number({ description: 'Chunk size for processing datasets (default: 50)' })
    public chunkSize: number = 50;

    //example: node ace list:updateable-datacite
    //example: node ace list:updateable-datacite --verbose
    //example: node ace list:updateable-datacite --count-only
    //example: node ace list:updateable-datacite --ids-only
    //example: node ace list:updateable-datacite --chunk-size 50

    public static options: CommandOptions = {
        startApp: true,
        stayAlive: false,
    };

    /**
     * True when the output is meant to be machine-readable (`--count-only` or
     * `--ids-only`); progress and warning logs are suppressed in that case.
     */
    private get isSilent(): boolean {
        return this.countOnly || this.idsOnly;
    }

    /**
     * Entry point: validates configuration and flags, scans all candidate
     * datasets page by page and prints the result in the requested format.
     *
     * Configuration and flag errors set a non-zero exit code so that calling
     * scripts can tell a failed run from a successful one.
     */
    async run(): Promise<void> {
        const prefix = env.get('DATACITE_PREFIX', '');
        const base_domain = env.get('BASE_DOMAIN', '');

        if (!prefix || !base_domain) {
            logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
            this.exitCode = 1;
            return;
        }

        // Prevent conflicting flags
        if (this.verbose && this.isSilent) {
            logger.error('Flags --verbose cannot be combined with --count-only or --ids-only');
            this.exitCode = 1;
            return;
        }

        const chunkSize = this.resolveChunkSize();
        const updatableDatasets: Dataset[] = [];
        let totalProcessed = 0;

        if (!this.isSilent) {
            logger.info(`Processing datasets in chunks of ${chunkSize}...`);
        }

        for (let page = 1; ; page += 1) {
            const datasets = await this.getDatasets(page, chunkSize);
            if (datasets.length === 0) {
                break;
            }

            if (!this.isSilent) {
                logger.info(`Processing chunk ${page} (${datasets.length} datasets)...`);
            }

            updatableDatasets.push(...(await this.processChunk(datasets)));
            totalProcessed += datasets.length;

            // A short page means this was the last one; skip the extra empty query.
            if (datasets.length < chunkSize) {
                break;
            }
        }

        if (!this.isSilent) {
            logger.info(`Processed ${totalProcessed} datasets total, found ${updatableDatasets.length} that need updates`);
        }

        if (this.countOnly) {
            console.log(updatableDatasets.length);
        } else if (this.idsOnly) {
            updatableDatasets.forEach((dataset) => console.log(dataset.publish_id));
        } else if (this.verbose) {
            await this.showVerboseOutput(updatableDatasets);
        } else {
            this.showSimpleOutput(updatableDatasets);
        }
    }

    /**
     * Returns the page size to use: the `--chunk-size` value when it is a
     * positive integer, otherwise the default.
     */
    private resolveChunkSize(): number {
        const requested = this.chunkSize;
        if (Number.isInteger(requested) && requested > 0) {
            return requested;
        }

        // 0 / missing silently mean "use the default"; anything else is a user mistake worth mentioning.
        if (requested && !this.isSilent) {
            logger.warn(`Invalid chunk size ${requested}, falling back to ${DEFAULT_CHUNK_SIZE}`);
        }
        return DEFAULT_CHUNK_SIZE;
    }

    /**
     * Processes a chunk of datasets to determine which ones need DataCite updates.
     *
     * Checks run in parallel, but with limited concurrency so the DataCite API
     * is not flooded. A failing check never aborts the chunk: the error is
     * logged (unless output is silent) and the dataset is included anyway,
     * because listing a dataset that might not need an update is safer than
     * missing one that does.
     *
     * @param datasets - Array of Dataset objects to process
     * @returns Array of datasets that need updates (or whose status is unknown)
     */
    private async processChunk(datasets: Dataset[]): Promise<Dataset[]> {
        const limit = pLimit(MAX_CONCURRENT_DOI_CHECKS);

        const tasks = datasets.map((dataset) =>
            limit(async (): Promise<Dataset | null> => {
                try {
                    const needsUpdate = await this.shouldUpdateDataset(dataset);
                    return needsUpdate ? dataset : null;
                } catch (error) {
                    if (!this.isSilent) {
                        logger.warn(`Error checking dataset ${dataset.publish_id}: ${this.describeError(error)}`);
                    }
                    // Fail-safe: include dataset if uncertain
                    return dataset;
                }
            }),
        );

        // allSettled rather than all: even an unexpected rejection must not discard the rest of the chunk.
        const results = await Promise.allSettled(tasks);

        return results
            .filter(
                (result): result is PromiseFulfilledResult<Dataset> => result.status === 'fulfilled' && result.value !== null,
            )
            .map((result) => result.value);
    }

    /**
     * Loads one page of published datasets that have a DOI identifier, ordered
     * by `publish_id`, with the relations needed for checking and display.
     *
     * @param page - 1-based page number
     * @param chunkSize - Number of datasets per page
     */
    private async getDatasets(page: number, chunkSize: number): Promise<Dataset[]> {
        return await Dataset.query()
            .where('server_state', 'published')
            .whereHas('identifier', (identifierQuery) => {
                identifierQuery.where('type', 'doi');
            })
            .preload('identifier')
            .preload('xmlCache')
            .preload('titles')
            .orderBy('publish_id', 'asc')
            .forPage(page, chunkSize); // Get datasets for the current page
    }

    /**
     * Decides whether the DataCite record of a dataset is outdated.
     *
     * Returns `false` when the dataset has no DOI or carries a modification
     * date in the future, `true` when the dataset was modified more than the
     * tolerance after the DOI record, and `true` whenever either timestamp is
     * unavailable or invalid (fail-safe).
     *
     * Unexpected errors (e.g. while loading the identifier) are not swallowed
     * here; they propagate so the caller can log them and apply its fail-safe.
     */
    private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
        let doiIdentifier = dataset.identifier;
        if (!doiIdentifier) {
            await dataset.load('identifier');
            doiIdentifier = dataset.identifier;
        }

        if (!doiIdentifier || doiIdentifier.type !== 'doi') {
            return false;
        }

        const datasetModified =
            dataset.server_date_modified instanceof DateTime
                ? dataset.server_date_modified
                : DateTime.fromJSDate(dataset.server_date_modified);

        // fromJSDate() returns an *invalid* DateTime (a truthy object) for missing
        // input, so validity has to be checked explicitly.
        if (!datasetModified.isValid) {
            return true;
        }

        if (datasetModified > DateTime.now()) {
            return false;
        }

        const doiClient = new DoiClient();
        const doiLastModified = await this.withTimeout(
            doiClient.getDoiLastModified(doiIdentifier.value),
            DOI_CHECK_TIMEOUT_MS,
        ).catch(() => null);

        if (!doiLastModified) {
            // If uncertain, better include dataset for update
            return true;
        }

        const doiModified = DateTime.fromJSDate(doiLastModified);
        if (!doiModified.isValid) {
            return true;
        }

        // Only a dataset that is newer than the DOI record by more than the tolerance counts.
        return datasetModified.diff(doiModified, 'seconds').seconds > UPDATE_TOLERANCE_SECONDS;
    }

    /**
     * Awaits `operation`, rejecting if it does not settle within `timeoutMs`.
     * The timer is always cleared so it cannot keep the process alive.
     */
    private async withTimeout<T>(operation: Promise<T>, timeoutMs: number): Promise<T> {
        let timer: ReturnType<typeof setTimeout> | undefined;
        const timeout = new Promise<never>((_, reject) => {
            timer = setTimeout(() => reject(new Error(`API call timeout after ${timeoutMs}ms`)), timeoutMs);
        });

        try {
            return await Promise.race([operation, timeout]);
        } finally {
            clearTimeout(timer);
        }
    }

    /** Turns any thrown value into a printable message. */
    private describeError(error: unknown): string {
        return error instanceof Error ? error.message : JSON.stringify(error);
    }

    /** Prints one line per dataset plus a hint on how to run the update. */
    private showSimpleOutput(updatableDatasets: Dataset[]): void {
        if (updatableDatasets.length === 0) {
            console.log('No datasets need DataCite updates.');
            return;
        }

        console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);

        updatableDatasets.forEach((dataset) => {
            console.log(`publish_id ${dataset.publish_id} needs update - ${dataset.mainTitle || 'Untitled'}`);
        });

        console.log(`\nTo update these datasets, run:`);
        console.log(` node ace update:datacite`);
        console.log(`\nOr update specific datasets:`);
        console.log(` node ace update:datacite -p <publish_id>`);
    }

    /** Prints a detail box for every dataset, followed by a summary line. */
    private async showVerboseOutput(updatableDatasets: Dataset[]): Promise<void> {
        if (updatableDatasets.length === 0) {
            console.log('No datasets need DataCite updates.');
            return;
        }

        console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);

        for (const dataset of updatableDatasets) {
            await this.showDatasetDetails(dataset);
        }

        console.log(`\nSummary: ${updatableDatasets.length} datasets need updates`);
    }

    /**
     * Prints title, DOI, DOI state and both modification timestamps of a
     * dataset. If the DataCite lookup fails, a reduced box containing the error
     * message is printed instead.
     */
    private async showDatasetDetails(dataset: Dataset): Promise<void> {
        try {
            let doiIdentifier = dataset.identifier;
            if (!doiIdentifier) {
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            const doiValue = doiIdentifier?.value || 'N/A';
            const datasetModified = dataset.server_date_modified;

            // Get DOI info from DataCite; the two lookups are independent, so run them together
            const doiClient = new DoiClient();
            const [doiLastModified, doiState] = await Promise.all([
                doiClient.getDoiLastModified(doiValue),
                doiClient.getDoiState(doiValue),
            ]);

            this.printDatasetBox(dataset, [
                `Title: ${dataset.mainTitle || 'Untitled'}`,
                `DOI: ${doiValue}`,
                `DOI State: ${doiState || 'Unknown'}`,
                `Dataset Modified: ${datasetModified ? datasetModified.toISO() : 'N/A'}`,
                `DOI Modified: ${doiLastModified ? DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}`,
                `Status: NEEDS UPDATE`,
            ]);
        } catch (error) {
            this.printDatasetBox(dataset, [
                `Title: ${dataset.mainTitle || 'Untitled'}`,
                `DOI: ${dataset.identifier?.value || 'N/A'}`,
                `Error: ${this.describeError(error)}`,
                `Status: NEEDS UPDATE (Error checking)`,
            ]);
        }
    }

    /** Prints the given rows inside the framed box used by the verbose output. */
    private printDatasetBox(dataset: Dataset, rows: string[]): void {
        console.log(`┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`);
        for (const row of rows) {
            console.log(`│ ${row}`);
        }
        console.log(`└─────────────────────────────────────────────────────────────────────────────────────────────\n`);
    }
}