tethys.backend/commands/list_updatable_datacite.ts

/*
|--------------------------------------------------------------------------
| node ace make:command list-updateable-datacite
| DONE:    create commands/list_updatable_datacite.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import { CommandOptions } from '@adonisjs/core/types/ace';
import Dataset from '#models/dataset';
import { DoiClient } from '#app/Library/Doi/DoiClient';
import env from '#start/env';
import logger from '@adonisjs/core/services/logger';
import { DateTime } from 'luxon';
import pLimit from 'p-limit';

/**
 * Ace command that lists published datasets whose DataCite DOI record looks
 * older than the dataset itself and therefore needs to be re-registered.
 *
 * Datasets are read page by page, each one is compared against the DOI's
 * last-modified timestamp reported by `DoiClient`, and the matches are printed
 * in one of four formats (simple, verbose, count only, ids only).
 *
 * Whenever the status of a dataset cannot be determined (API error, timeout,
 * unusable timestamp) the dataset is reported as needing an update: listing one
 * dataset too many is considered safer than missing one.
 *
 * Examples:
 *   node ace list:updateable-datacite
 *   node ace list:updateable-datacite --verbose
 *   node ace list:updateable-datacite --count-only
 *   node ace list:updateable-datacite --ids-only
 *   node ace list:updateable-datacite --chunk-size 50
 */
export default class ListUpdateableDatacite extends BaseCommand {
    static commandName = 'list:updateable-datacite';
    static description = 'List all datasets that need DataCite DOI updates';

    public static needsApplication = true;

    public static options: CommandOptions = {
        startApp: true,
        stayAlive: false,
    };

    /** Page size used when `--chunk-size` is absent, `0` or not a number. */
    private static readonly DEFAULT_CHUNK_SIZE = 50;

    /** Upper bound of simultaneous DOI look-ups, to avoid flooding the API. */
    private static readonly MAX_CONCURRENT_CHECKS = 5;

    /** How long a single DOI last-modified look-up may take before it is abandoned. */
    private static readonly DOI_CHECK_TIMEOUT_MS = 300;

    /** A dataset must be newer than its DOI record by more than this to count as outdated. */
    private static readonly UPDATE_TOLERANCE_SECONDS = 600;

    @flags.boolean({ alias: 'v', description: 'Verbose output showing detailed information' })
    public verbose: boolean = false;

    @flags.boolean({ alias: 'c', description: 'Show only count of updatable datasets' })
    public countOnly: boolean = false;

    @flags.boolean({ alias: 'i', description: 'Show only publish IDs (useful for scripting)' })
    public idsOnly: boolean = false;

    @flags.number({ description: 'Chunk size for processing datasets (default: 50)' })
    public chunkSize: number = 50;

    /**
     * True in the machine-readable modes (`--count-only`, `--ids-only`), where
     * progress and warning logs are suppressed so the output stays parseable.
     */
    private get quiet(): boolean {
        return this.countOnly || this.idsOnly;
    }

    /**
     * Command entry point: validates configuration and flags, walks all
     * candidate datasets page by page and prints the result.
     *
     * On a configuration or flag error the problem is logged and the command
     * finishes with exit code 1 so that calling scripts can detect the failure.
     */
    async run() {
        const prefix = env.get('DATACITE_PREFIX', '');
        const baseDomain = env.get('BASE_DOMAIN', '');

        if (!prefix || !baseDomain) {
            logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
            this.exitCode = 1;
            return;
        }

        // Prevent conflicting flags
        if (this.verbose && (this.countOnly || this.idsOnly)) {
            logger.error('Flags --verbose cannot be combined with --count-only or --ids-only');
            this.exitCode = 1;
            return;
        }

        // A missing / zero / NaN flag value falls back to the default; anything
        // else must be a positive integer because it is used as the page size.
        const chunkSize = this.chunkSize || ListUpdateableDatacite.DEFAULT_CHUNK_SIZE;
        if (!Number.isInteger(chunkSize) || chunkSize < 1) {
            logger.error('Flag --chunk-size must be a positive integer');
            this.exitCode = 1;
            return;
        }

        if (!this.quiet) {
            logger.info(`Processing datasets in chunks of ${chunkSize}...`);
        }

        const updatableDatasets: Dataset[] = [];
        let totalProcessed = 0;

        for (let page = 1; ; page += 1) {
            const datasets = await this.getDatasets(page, chunkSize);
            if (datasets.length === 0) {
                break;
            }

            if (!this.quiet) {
                logger.info(`Processing chunk ${page} (${datasets.length} datasets)...`);
            }

            for (const dataset of await this.processChunk(datasets)) {
                updatableDatasets.push(dataset);
            }
            totalProcessed += datasets.length;

            // A short page is the last page; skip the extra empty query.
            if (datasets.length < chunkSize) {
                break;
            }
        }

        if (!this.quiet) {
            logger.info(`Processed ${totalProcessed} datasets total, found ${updatableDatasets.length} that need updates`);
        }

        if (this.countOnly) {
            console.log(updatableDatasets.length);
        } else if (this.idsOnly) {
            for (const dataset of updatableDatasets) {
                console.log(dataset.publish_id);
            }
        } else if (this.verbose) {
            await this.showVerboseOutput(updatableDatasets);
        } else {
            this.showSimpleOutput(updatableDatasets);
        }
    }

    /**
     * Processes a chunk of datasets to determine which ones need DataCite updates.
     *
     * The checks run in parallel, but at most `MAX_CONCURRENT_CHECKS` at a time.
     * `shouldUpdateDataset` handles its own failures (it resolves to `true`
     * instead of rejecting), so one bad dataset cannot abort the whole chunk.
     *
     * @param datasets - Array of Dataset objects to process
     * @returns the datasets of this chunk that need updates, in input order
     */
    private async processChunk(datasets: Dataset[]): Promise<Dataset[]> {
        const limit = pLimit(ListUpdateableDatacite.MAX_CONCURRENT_CHECKS);

        const checked = await Promise.all(
            datasets.map((dataset) => limit(async () => ((await this.shouldUpdateDataset(dataset)) ? dataset : null))),
        );

        return checked.filter((dataset): dataset is Dataset => dataset !== null);
    }

    /**
     * Loads one page of published datasets that have a DOI identifier, ordered
     * by `publish_id` so that paging is stable.
     *
     * @param page - 1-based page number
     * @param chunkSize - number of datasets per page
     */
    private async getDatasets(page: number, chunkSize: number): Promise<Dataset[]> {
        return await Dataset.query()
            .orderBy('publish_id', 'asc')
            .preload('identifier')
            .preload('xmlCache')
            .preload('titles')
            .where('server_state', 'published')
            .whereHas('identifier', (identifierQuery) => {
                identifierQuery.where('type', 'doi');
            })
            .forPage(page, chunkSize);
    }

    /**
     * Decides whether the DataCite record of a dataset is out of date.
     *
     * Returns `false` when the dataset has no DOI identifier, when its
     * modification date lies in the future, or when it is not newer than the
     * DOI record by more than `UPDATE_TOLERANCE_SECONDS`.
     * Returns `true` when the dataset is newer than that, and also whenever the
     * answer cannot be determined (unusable timestamp, API failure, timeout,
     * unexpected error).
     *
     * This method never rejects; unexpected errors are logged (unless in a
     * quiet mode) and treated as "needs update".
     */
    private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
        try {
            const doiIdentifier = await this.loadIdentifier(dataset);
            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
                return false;
            }

            // DateTime.fromJSDate() yields an *invalid* DateTime (not null) for bad
            // input, so validity has to be checked explicitly.
            const datasetModified = this.toDateTime(dataset.server_date_modified);
            if (datasetModified === null) {
                return true;
            }

            if (datasetModified.toMillis() > Date.now()) {
                return false;
            }

            let doiModified: DateTime | null = null;
            try {
                const doiLastModified = await this.withTimeout(
                    new DoiClient().getDoiLastModified(doiIdentifier.value),
                    ListUpdateableDatacite.DOI_CHECK_TIMEOUT_MS,
                );
                doiModified = this.toDateTime(doiLastModified);
            } catch (error) {
                logger.debug(`DOI lookup failed for dataset ${dataset.publish_id}: ${this.describeError(error)}`);
            }

            if (doiModified === null) {
                // If uncertain, better include dataset for update
                return true;
            }

            const datasetAheadSeconds = (datasetModified.toMillis() - doiModified.toMillis()) / 1000;
            return datasetAheadSeconds > ListUpdateableDatacite.UPDATE_TOLERANCE_SECONDS;
        } catch (error) {
            if (!this.quiet) {
                logger.warn(`Error checking dataset ${dataset.publish_id}: ${this.describeError(error)}`);
            }
            return true; // safer: include dataset if unsure
        }
    }

    /**
     * Returns the dataset's identifier relation, loading it first if it has not
     * been preloaded.
     */
    private async loadIdentifier(dataset: Dataset): Promise<Dataset['identifier']> {
        if (!dataset.identifier) {
            await dataset.load('identifier');
        }
        return dataset.identifier;
    }

    /**
     * Normalises a timestamp that may be a Luxon `DateTime` or a JS `Date`.
     *
     * @returns a valid `DateTime`, or `null` when the value is missing, of
     *          another type, or does not represent a valid point in time
     */
    private toDateTime(value: unknown): DateTime | null {
        if (value instanceof DateTime) {
            return value.isValid ? value : null;
        }
        if (value instanceof Date) {
            const converted = DateTime.fromJSDate(value);
            return converted.isValid ? converted : null;
        }
        return null;
    }

    /**
     * Awaits `operation`, but rejects if it has not settled after `timeoutMs`.
     * The timer is always cleared so no pending timeout outlives the call.
     *
     * @throws Error with message `API call timeout after <n>ms` on timeout,
     *         or whatever `operation` rejects with
     */
    private async withTimeout<T>(operation: Promise<T>, timeoutMs: number): Promise<T> {
        let timer: ReturnType<typeof setTimeout> | undefined;
        const timeout = new Promise<never>((_, reject) => {
            timer = setTimeout(() => reject(new Error(`API call timeout after ${timeoutMs}ms`)), timeoutMs);
        });

        try {
            return await Promise.race([operation, timeout]);
        } finally {
            clearTimeout(timer);
        }
    }

    /** Turns any thrown value into a printable message. */
    private describeError(error: unknown): string {
        if (error instanceof Error) {
            return error.message;
        }
        if (typeof error === 'string') {
            return error;
        }
        try {
            // JSON.stringify returns undefined for e.g. `undefined` or functions.
            const serialized: string | undefined = JSON.stringify(error);
            return serialized ?? String(error);
        } catch {
            return String(error);
        }
    }

    /** Prints one line per dataset plus a hint on how to run the update. */
    private showSimpleOutput(updatableDatasets: Dataset[]): void {
        if (updatableDatasets.length === 0) {
            console.log('No datasets need DataCite updates.');
            return;
        }

        console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);

        for (const dataset of updatableDatasets) {
            console.log(`publish_id  ${dataset.publish_id} needs update - ${dataset.mainTitle || 'Untitled'}`);
        }

        console.log(`\nTo update these datasets, run:`);
        console.log(`  node ace update:datacite`);
        console.log(`\nOr update specific datasets:`);
        console.log(`  node ace update:datacite -p <publish_id>`);
    }

    /**
     * Prints a detail box per dataset followed by a summary line.
     * Datasets are handled one after another so the boxes appear in order.
     */
    private async showVerboseOutput(updatableDatasets: Dataset[]): Promise<void> {
        if (updatableDatasets.length === 0) {
            console.log('No datasets need DataCite updates.');
            return;
        }

        console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);

        for (const dataset of updatableDatasets) {
            await this.showDatasetDetails(dataset);
        }

        console.log(`\nSummary: ${updatableDatasets.length} datasets need updates`);
    }

    /**
     * Prints title, DOI, DOI state and both modification timestamps of one
     * dataset. If fetching the DOI information fails, a reduced box containing
     * the error message is printed instead; the method itself does not reject.
     */
    private async showDatasetDetails(dataset: Dataset): Promise<void> {
        const title = dataset.mainTitle || 'Untitled';

        try {
            const doiIdentifier = await this.loadIdentifier(dataset);
            const doi = doiIdentifier?.value;
            const datasetModified = this.toDateTime(dataset.server_date_modified);

            // Only query DataCite when there is a DOI; both look-ups are independent.
            const doiClient = new DoiClient();
            const [doiLastModified, doiState] = doi
                ? await Promise.all([doiClient.getDoiLastModified(doi), doiClient.getDoiState(doi)])
                : ([null, null] as const);
            const doiModified = this.toDateTime(doiLastModified);

            this.printDetailBox(dataset, [
                ['Title', title],
                ['DOI', doi || 'N/A'],
                ['DOI State', `${doiState || 'Unknown'}`],
                ['Dataset Modified', datasetModified?.toISO() ?? 'N/A'],
                ['DOI Modified', doiModified?.toISO() ?? 'N/A'],
                ['Status', 'NEEDS UPDATE'],
            ]);
        } catch (error) {
            this.printDetailBox(dataset, [
                ['Title', title],
                ['DOI', dataset.identifier?.value || 'N/A'],
                ['Error', this.describeError(error)],
                ['Status', 'NEEDS UPDATE (Error checking)'],
            ]);
        }
    }

    /**
     * Draws a framed block of `label: value` rows for one dataset; labels are
     * padded so that all values start in the same column.
     */
    private printDetailBox(dataset: Dataset, rows: ReadonlyArray<readonly [string, string]>): void {
        console.log(`┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`);
        for (const [label, value] of rows) {
            console.log(`│ ${`${label}:`.padEnd(21)}${value}`);
        }
        console.log(`└─────────────────────────────────────────────────────────────────────────────────────────────\n`);
    }
}