feat: Enhance ClamAV Docker entrypoint and configuration

- Updated docker-entrypoint.sh to improve ClamAV service initialization and logging.
- Added checks for ClamAV and freshclam daemon status.
- Optimized freshclam configuration for container usage, including logging to stdout and setting the database directory.
- Introduced a caching mechanism for enabled file extensions in vinejs_provider.ts to reduce database queries.
- Implemented a new command to list datasets needing DataCite DOI updates, with options for verbose output, count only, and IDs only.
- Updated package dependencies to include p-limit and pino-pretty.
- Finalized the ace command 'detect:missing-cross-references'.
2025-09-26 12:19:35 +02:00 · 2025-09-26 12:19:35 +02:00 · 6757bdb77c
commit 6757bdb77c
parent 4c8cce27da
10 changed files with 745 additions and 430 deletions
--- a/commands/fix_dataset_cross_references.ts
+++ b/commands/fix_dataset_cross_references.ts
@ -6,6 +6,7 @@
 */
 import { BaseCommand, flags } from '@adonisjs/core/ace';
 import type { CommandOptions } from '@adonisjs/core/types/ace';
+import { DateTime } from 'luxon';
 import Dataset from '#models/dataset';
 import DatasetReference from '#models/dataset_reference';
 // import env from '#start/env';
@ -15,6 +16,8 @@ interface MissingCrossReference {
    targetDatasetId: number;
    sourcePublishId: number | null;
    targetPublishId: number | null;
+    sourceDoi: string | null;
+    targetDoi: string | null;
    referenceType: string;
    relation: string;
    doi: string | null;
@ -33,30 +36,58 @@ export default class DetectMissingCrossReferences extends BaseCommand {
    @flags.boolean({ alias: 'v', description: 'Verbose output' })
    public verbose: boolean = false;

+    @flags.number({ alias: 'p', description: 'Filter by specific publish_id (source or target dataset)' })
+    public publish_id?: number;
+
+    // example: node ace detect:missing-cross-references --verbose -p 227 //if you want to filter by specific publish_id with details
+    // example: node ace detect:missing-cross-references --verbose
+    // example: node ace detect:missing-cross-references --fix -p 227 //if you want to filter by specific publish_id and fix it
+    // example: node ace detect:missing-cross-references
+
    public static options: CommandOptions = {
        startApp: true,
        staysAlive: false,
    };

+    // Define the allowed relations that we want to process
+    private readonly ALLOWED_RELATIONS = ['IsNewVersionOf', 'IsPreviousVersionOf', 'IsVariantFormOf', 'IsOriginalFormOf'];
+
    async run() {
        this.logger.info('🔍 Detecting missing cross-references...');
+        this.logger.info(`📋 Processing only these relations: ${this.ALLOWED_RELATIONS.join(', ')}`);
+
+        if (this.publish_id) {
+            this.logger.info(`Filtering by publish_id: ${this.publish_id}`);
+        }

        try {
            const missingReferences = await this.findMissingCrossReferences();

            if (missingReferences.length === 0) {
-                this.logger.success('All cross-references are properly linked!');
+                const filterMsg = this.publish_id ? ` for publish_id ${this.publish_id}` : '';
+                this.logger.success(`All cross-references are properly linked for the specified relations${filterMsg}!`);
                return;
            }

-            this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s):`);
+            const filterMsg = this.publish_id ? ` (filtered by publish_id ${this.publish_id})` : '';
+            this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s)${filterMsg}:`);

-            for (const missing of missingReferences) {
-                this.logger.info(
-                    `Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`,
-                );
+            // Show brief list if not verbose mode
+            if (!this.verbose) {
+                for (const missing of missingReferences) {
+                    const sourceDoi = missing.sourceDoi ? ` DOI: ${missing.sourceDoi}` : '';
+                    const targetDoi = missing.targetDoi ? ` DOI: ${missing.targetDoi}` : '';

-                if (this.verbose) {
+                    this.logger.info(
+                        `Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId}${sourceDoi}) ${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId}${targetDoi}) → missing reverse: ${missing.reverseRelation}`,
+                    );
+                }
+            } else {
+                // Verbose mode - show detailed info
+                for (const missing of missingReferences) {
+                    this.logger.info(
+                        `Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`,
+                    );
                    this.logger.info(`  - Reference type: ${missing.referenceType}`);
                    this.logger.info(`  - Relation: ${missing.relation}`);
                    this.logger.info(`  - DOI: ${missing.doi}`);
@ -67,20 +98,28 @@ export default class DetectMissingCrossReferences extends BaseCommand {
                await this.fixMissingReferences(missingReferences);
                this.logger.success('All missing cross-references have been fixed!');
            } else {
-                this.printMissingReferencesList(missingReferences);
+                if (this.verbose) {
+                    this.printMissingReferencesList(missingReferences);
+                }
                this.logger.info('💡 Run with --fix flag to automatically create missing cross-references');
+                if (this.publish_id) {
+                    this.logger.info(`🎯 Currently filtering by publish_id: ${this.publish_id}`);
+                }
            }
        } catch (error) {
            this.logger.error('Error detecting missing cross-references:', error);
            process.exit(1);
        }
    }
+
    private async findMissingCrossReferences(): Promise<MissingCrossReference[]> {
        const missingReferences: {
            sourceDatasetId: number;
            targetDatasetId: number;
            sourcePublishId: number | null;
            targetPublishId: number | null;
+            sourceDoi: string | null;
+            targetDoi: string | null;
            referenceType: string;
            relation: string;
            doi: string | null;
@ -90,22 +129,32 @@ export default class DetectMissingCrossReferences extends BaseCommand {
        this.logger.info('📊 Querying dataset references...');

        // Find all references that point to Tethys datasets (DOI or URL containing tethys DOI)
-        // Only from datasets that are published
-        const tethysReferences = await DatasetReference.query()
+        // Only from datasets that are published AND only for allowed relations
+        const tethysReferencesQuery = DatasetReference.query()
            .whereIn('type', ['DOI', 'URL'])
+            .whereIn('relation', this.ALLOWED_RELATIONS) // Only process allowed relations
            .where((query) => {
                query.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
            })
            .preload('dataset', (datasetQuery) => {
-                datasetQuery.where('server_state', 'published');
+                datasetQuery.preload('identifier');
            })
            .whereHas('dataset', (datasetQuery) => {
                datasetQuery.where('server_state', 'published');
            });
+        if (typeof this.publish_id === 'number') {
+            tethysReferencesQuery.whereHas('dataset', (datasetQuery) => {
+                datasetQuery.where('publish_id', this.publish_id as number);
+            });
+        }

-        this.logger.info(`🔗 Found ${tethysReferences.length} Tethys references from published datasets`);
+        const tethysReferences = await tethysReferencesQuery.exec();
+
+        this.logger.info(`🔗 Found ${tethysReferences.length} Tethys references from published datasets (allowed relations only)`);

        let processedCount = 0;
+        let skippedCount = 0;
+
        for (const reference of tethysReferences) {
            processedCount++;

@ -113,6 +162,15 @@ export default class DetectMissingCrossReferences extends BaseCommand {
                this.logger.info(`📈 Processed ${processedCount}/${tethysReferences.length} references...`);
            }

+            // Double-check that this relation is in our allowed list (safety check)
+            if (!this.ALLOWED_RELATIONS.includes(reference.relation)) {
+                skippedCount++;
+                if (this.verbose) {
+                    this.logger.info(`⏭️  Skipping relation "${reference.relation}" - not in allowed list`);
+                }
+                continue;
+            }
+
            // Extract dataset publish_id from DOI or URL
            const targetDatasetPublish = this.extractDatasetPublishIdFromReference(reference.value);

@ -127,6 +185,7 @@ export default class DetectMissingCrossReferences extends BaseCommand {
            const targetDataset = await Dataset.query()
                .where('publish_id', targetDatasetPublish)
                .where('server_state', 'published')
+                .preload('identifier')
                .first();

            if (!targetDataset) {
@ -145,25 +204,31 @@ export default class DetectMissingCrossReferences extends BaseCommand {
            // Check if reverse reference exists
            const reverseReferenceExists = await this.checkReverseReferenceExists(
                targetDataset.id,
-                reference.document_id,
+                // reference.document_id,
                reference.relation,
            );

            if (!reverseReferenceExists) {
-                missingReferences.push({
-                    sourceDatasetId: reference.document_id,
-                    targetDatasetId: targetDataset.id,
-                    sourcePublishId: reference.dataset.publish_id || null,
-                    targetPublishId: targetDataset.publish_id || null,
-                    referenceType: reference.type,
-                    relation: reference.relation,
-                    doi: reference.value,
-                    reverseRelation: this.getReverseRelation(reference.relation),
-                });
+                const reverseRelation = this.getReverseRelation(reference.relation);
+                if (reverseRelation) {
+                    // Only add if we have a valid reverse relation
+                    missingReferences.push({
+                        sourceDatasetId: reference.document_id,
+                        targetDatasetId: targetDataset.id,
+                        sourcePublishId: reference.dataset.publish_id || null,
+                        targetPublishId: targetDataset.publish_id || null,
+                        referenceType: reference.type,
+                        relation: reference.relation,
+                        doi: reference.value,
+                        reverseRelation: reverseRelation,
+                        sourceDoi: reference.dataset.identifier ? reference.dataset.identifier.value : null,
+                        targetDoi: targetDataset.identifier ? targetDataset.identifier.value : null,
+                    });
+                }
            }
        }

-        this.logger.info(`✅ Processed all ${processedCount} references`);
+        this.logger.info(`✅ Processed ${processedCount} references (${skippedCount} skipped due to relation filtering)`);
        return missingReferences;
    }

@ -183,64 +248,47 @@ export default class DetectMissingCrossReferences extends BaseCommand {
        return null;
    }

-    private async checkReverseReferenceExists(
-        sourceDatasetId: number,
-        targetDatasetId: number,
-        originalRelation: string,
-    ): Promise<boolean> {
+    private async checkReverseReferenceExists(targetDatasetId: number, originalRelation: string): Promise<boolean> {
        const reverseRelation = this.getReverseRelation(originalRelation);

+        if (!reverseRelation) {
+            return true; // If no reverse relation is defined, consider it as "exists" to skip processing
+        }
+
        // Only check for reverse references where the source dataset is also published
        const reverseReference = await DatasetReference.query()
-            .where('document_id', sourceDatasetId)
+            // We don't filter by source document_id here, so any incoming reference to the target dataset counts
+            // .where('document_id', sourceDatasetId)
            .where('related_document_id', targetDatasetId)
            .where('relation', reverseRelation)
-            .whereHas('dataset', (datasetQuery) => {
-                datasetQuery.where('server_state', 'published');
-            })
            .first();

        return !!reverseReference;
    }

-    private getReverseRelation(relation: string): string {
+    private getReverseRelation(relation: string): string | null {
        const relationMap: Record<string, string> = {
            IsNewVersionOf: 'IsPreviousVersionOf',
            IsPreviousVersionOf: 'IsNewVersionOf',
-            
-            IsVersionOf: 'HasVersion',
-            HasVersion: 'IsVersionOf',
-
-            Compiles: 'IsCompiledBy',
-            IsCompiledBy: 'Compiles',
-
            IsVariantFormOf: 'IsOriginalFormOf',
            IsOriginalFormOf: 'IsVariantFormOf',
-
-            IsPartOf: 'HasPart',
-            HasPart: 'IsPartOf',
-
-            IsSupplementTo: 'IsSupplementedBy',
-            IsSupplementedBy: 'IsSupplementTo',
-
-            Continues: 'IsContinuedBy',
-            IsContinuedBy: 'Continues',
        };

-        // to catch relation types like 'compiles' or 'IsVariantFormOf' that are not in the map mark reverse as 'HasVersion'
-        return relationMap[relation] || 'HasVersion'; // Default fallback
+        // Only return reverse relation if it exists in our map, otherwise return null
+        return relationMap[relation] || null;
    }

    private printMissingReferencesList(missingReferences: MissingCrossReference[]) {
        console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
        console.log('│                         MISSING CROSS-REFERENCES REPORT                        │');
-        console.log('│                            (Published Datasets Only)                           │');
+        console.log('│                     (Published Datasets Only - Filtered Relations)            │');
        console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
        console.log();

        missingReferences.forEach((missing, index) => {
            console.log(
-                `${index + 1}. Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId}) → Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId})`,
+                `${index + 1}. Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId} Identifier: ${missing.sourceDoi}) 
+                ${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId} Identifier: ${missing.targetDoi})`,
            );
            console.log(`   ├─ Current relation: "${missing.relation}"`);
            console.log(`   ├─ Missing reverse relation: "${missing.reverseRelation}"`);
@ -251,6 +299,7 @@ export default class DetectMissingCrossReferences extends BaseCommand {

        console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
        console.log(`│ SUMMARY: ${missingReferences.length} missing reverse reference(s) detected                      │`);
+        console.log(`│ Processed relations: ${this.ALLOWED_RELATIONS.join(', ')}                           │`);
        console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
    }

@ -262,27 +311,37 @@ export default class DetectMissingCrossReferences extends BaseCommand {

        for (const [index, missing] of missingReferences.entries()) {
            try {
-                // Get the source dataset to create proper reference - ensure it's published
+                // Get both source and target datasets
                const sourceDataset = await Dataset.query()
                    .where('id', missing.sourceDatasetId)
                    .where('server_state', 'published')
                    .preload('identifier')
                    .first();

+                const targetDataset = await Dataset.query().where('id', missing.targetDatasetId).where('server_state', 'published').first();
+
                if (!sourceDataset) {
                    this.logger.warning(`⚠️  Source dataset ${missing.sourceDatasetId} not found or not published, skipping...`);
                    errorCount++;
                    continue;
                }

-                // Create the reverse reference
+                if (!targetDataset) {
+                    this.logger.warning(`⚠️  Target dataset ${missing.targetDatasetId} not found or not published, skipping...`);
+                    errorCount++;
+                    continue;
+                }
+
+                // Create the reverse reference using the referenced_by relationship
+                // Example: If Dataset 297 IsNewVersionOf Dataset 144
+                // We create an incoming reference for Dataset 144 that shows Dataset 297 IsPreviousVersionOf it
                const reverseReference = new DatasetReference();
-                reverseReference.document_id = missing.targetDatasetId;
-                reverseReference.related_document_id = missing.sourceDatasetId;
+                // Don't set document_id - this creates an incoming reference via related_document_id
+                reverseReference.related_document_id = missing.targetDatasetId; // 144 (dataset receiving the incoming reference)
                reverseReference.type = 'DOI';
                reverseReference.relation = missing.reverseRelation;

-                // Use the source dataset's DOI for the value
+                // Use the source dataset's DOI for the value (what's being referenced)
                if (sourceDataset.identifier?.value) {
                    reverseReference.value = `https://doi.org/${sourceDataset.identifier.value}`;
                } else {
@ -293,12 +352,16 @@ export default class DetectMissingCrossReferences extends BaseCommand {
                // Use the source dataset's main title for the label
                reverseReference.label = sourceDataset.mainTitle || `Dataset ${missing.sourceDatasetId}`;

+                // Also save 'server_date_modified' on target dataset to trigger any downstream updates (e.g. search index)
+                targetDataset.server_date_modified = DateTime.now();
+                await targetDataset.save();
+
                await reverseReference.save();
                fixedCount++;

                if (this.verbose) {
                    this.logger.info(
-                        `✅ [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.targetDatasetId} -> ${missing.sourceDatasetId}`,
+                        `✅ [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.sourceDatasetId} -> ${missing.targetDatasetId} (${missing.reverseRelation})`,
                    );
                } else if ((index + 1) % 10 === 0) {
                    this.logger.info(`📈 Fixed ${fixedCount}/${missingReferences.length} references...`);
--- a/commands/list_updatable_datacite.ts
+++ b/commands/list_updatable_datacite.ts
@ -0,0 +1,346 @@
+/*
+|--------------------------------------------------------------------------
+| node ace make:command list-updateable-datacite
+| DONE:    create commands/list_updatable_datacite.ts
+|--------------------------------------------------------------------------
+*/
+import { BaseCommand, flags } from '@adonisjs/core/ace';
+import { CommandOptions } from '@adonisjs/core/types/ace';
+import Dataset from '#models/dataset';
+import { DoiClient } from '#app/Library/Doi/DoiClient';
+import env from '#start/env';
+import logger from '@adonisjs/core/services/logger';
+import { DateTime } from 'luxon';
+import pLimit from 'p-limit';
+
+export default class ListUpdateableDatacite extends BaseCommand {
+    static commandName = 'list:updateable-datacite';
+    static description = 'List all datasets that need DataCite DOI updates';
+
+    public static needsApplication = true;
+
+    // private chunkSize = 100; // Set chunk size for pagination
+
+    @flags.boolean({ alias: 'v', description: 'Verbose output showing detailed information' })
+    public verbose: boolean = false;
+
+    @flags.boolean({ alias: 'c', description: 'Show only count of updatable datasets' })
+    public countOnly: boolean = false;
+
+    @flags.boolean({ alias: 'i', description: 'Show only publish IDs (useful for scripting)' })
+    public idsOnly: boolean = false;
+
+    @flags.number({ description: 'Chunk size for processing datasets (default: 50)' })
+    public chunkSize: number = 50;
+
+    //example: node ace list:updateable-datacite
+    //example: node ace list:updateable-datacite --verbose
+    //example: node ace list:updateable-datacite --count-only
+    //example: node ace list:updateable-datacite --ids-only
+    //example: node ace list:updateable-datacite --chunk-size 50
+
+    public static options: CommandOptions = {
+        startApp: true,
+        stayAlive: false,
+    };
+
+    async run() {
+        const prefix = env.get('DATACITE_PREFIX', '');
+        const base_domain = env.get('BASE_DOMAIN', '');
+
+        if (!prefix || !base_domain) {
+            logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
+            return;
+        }
+
+        // Prevent conflicting flags
+        if ((this.verbose && this.countOnly) || (this.verbose && this.idsOnly)) {
+            logger.error('Flags --verbose cannot be combined with --count-only or --ids-only');
+            return;
+        }
+
+        const chunkSize = this.chunkSize || 50;
+        let page = 1;
+        let hasMoreDatasets = true;
+        let totalProcessed = 0;
+        const updatableDatasets: Dataset[] = [];
+
+        if (!this.countOnly && !this.idsOnly) {
+            logger.info(`Processing datasets in chunks of ${chunkSize}...`);
+        }
+
+        while (hasMoreDatasets) {
+            const datasets = await this.getDatasets(page, chunkSize);
+
+            if (datasets.length === 0) {
+                hasMoreDatasets = false;
+                break;
+            }
+
+            if (!this.countOnly && !this.idsOnly) {
+                logger.info(`Processing chunk ${page} (${datasets.length} datasets)...`);
+            }
+
+            const chunkUpdatableDatasets = await this.processChunk(datasets);
+            updatableDatasets.push(...chunkUpdatableDatasets);
+            totalProcessed += datasets.length;
+
+            page += 1;
+            if (datasets.length < chunkSize) {
+                hasMoreDatasets = false;
+            }
+        }
+
+        if (!this.countOnly && !this.idsOnly) {
+            logger.info(`Processed ${totalProcessed} datasets total, found ${updatableDatasets.length} that need updates`);
+        }
+
+        if (this.countOnly) {
+            console.log(updatableDatasets.length);
+        } else if (this.idsOnly) {
+            updatableDatasets.forEach((dataset) => console.log(dataset.publish_id));
+        } else if (this.verbose) {
+            await this.showVerboseOutput(updatableDatasets);
+        } else {
+            this.showSimpleOutput(updatableDatasets);
+        }
+    }
+
+    /**
+     * Processes a chunk of datasets to determine which ones need DataCite updates
+     *
+     * This method handles parallel processing of datasets within a chunk, providing
+     * efficient error handling and filtering of results.
+     *
+     * @param datasets - Array of Dataset objects to process
+     * @returns Promise<Dataset[]> - Array of datasets that need updates
+     */
+    // private async processChunk(datasets: Dataset[]): Promise<Dataset[]> {
+    //     // Process datasets in parallel using Promise.allSettled for better error handling
+    //     //
+    //     // Why Promise.allSettled vs Promise.all?
+    //     // - Promise.all fails fast: if ANY promise rejects, the entire operation fails
+    //     // - Promise.allSettled waits for ALL promises: some can fail, others succeed
+    //     // - This is crucial for batch processing where we don't want one bad dataset
+    //     //   to stop processing of the entire chunk
+    //     const results = await Promise.allSettled(
+    //         datasets.map(async (dataset) => {
+    //             try {
+    //                 // Check if this specific dataset needs a DataCite update
+    //                 const needsUpdate = await this.shouldUpdateDataset(dataset);
+
+    //                 // Return the dataset if it needs update, null if it doesn't
+    //                 // This creates a sparse array that we'll filter later
+    //                 return needsUpdate ? dataset : null;
+    //             } catch (error) {
+    //                 // Error handling for individual dataset checks
+    //                 //
+    //                 // Log warnings only if we're not in silent modes (count-only or ids-only)
+    //                 // This prevents log spam when running automated scripts
+    //                 if (!this.countOnly && !this.idsOnly) {
+    //                     logger.warn(`Error checking dataset ${dataset.publish_id}: ${error.message}`);
+    //                 }
+
+    //                 // IMPORTANT DECISION: Return the dataset anyway if we can't determine status
+    //                 //
+    //                 // Why? It's safer to include a dataset that might not need updating
+    //                 // than to miss one that actually does need updating. This follows the
+    //                 // "fail-safe" principle - if we're unsure, err on the side of caution
+    //                 return dataset;
+    //             }
+    //         }),
+    //     );
+
+    //     // Filter and extract results from Promise.allSettled response
+    //     //
+    //     // Promise.allSettled returns an array of objects with this structure:
+    //     // - { status: 'fulfilled', value: T } for successful promises
+    //     // - { status: 'rejected', reason: Error } for failed promises
+    //     //
+    //     // We need to:
+    //     // 1. Only get fulfilled results (rejected ones are already handled above)
+    //     // 2. Filter out null values (datasets that don't need updates)
+    //     // 3. Extract the actual Dataset objects from the wrapper
+    //     return results
+    //         .filter(
+    //             (result): result is PromiseFulfilledResult<Dataset | null> =>
+    //                 // Type guard: only include fulfilled results that have actual values
+    //                 // This filters out:
+    //                 // - Rejected promises (shouldn't happen due to try/catch, but safety first)
+    //                 // - Fulfilled promises that returned null (datasets that don't need updates)
+    //                 result.status === 'fulfilled' && result.value !== null,
+    //         )
+    //         .map((result) => result.value!); // Extract the Dataset from the wrapper
+    //     // The ! is safe here because we filtered out null values above
+    // }
+
+    private async processChunk(datasets: Dataset[]): Promise<Dataset[]> {
+        // Limit concurrency to avoid API flooding (e.g., max 5 at once)
+        const limit = pLimit(5);
+
+        const tasks = datasets.map((dataset) =>
+            limit(async () => {
+                try {
+                    const needsUpdate = await this.shouldUpdateDataset(dataset);
+                    return needsUpdate ? dataset : null;
+                } catch (error) {
+                    if (!this.countOnly && !this.idsOnly) {
+                        logger.warn(
+                            `Error checking dataset ${dataset.publish_id}: ${
+                                error instanceof Error ? error.message : JSON.stringify(error)
+                            }`,
+                        );
+                    }
+                    // Fail-safe: include dataset if uncertain
+                    return dataset;
+                }
+            }),
+        );
+
+        const results = await Promise.allSettled(tasks);
+
+        return results
+            .filter((result): result is PromiseFulfilledResult<Dataset | null> => result.status === 'fulfilled' && result.value !== null)
+            .map((result) => result.value!);
+    }
+
+    private async getDatasets(page: number, chunkSize: number): Promise<Dataset[]> {
+        return await Dataset.query()
+            .orderBy('publish_id', 'asc')
+            .preload('identifier')
+            .preload('xmlCache')
+            .preload('titles')
+            .where('server_state', 'published')
+            .whereHas('identifier', (identifierQuery) => {
+                identifierQuery.where('type', 'doi');
+            })
+            .forPage(page, chunkSize); // Get datasets for the current page
+    }
+
+    private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
+        try {
+            let doiIdentifier = dataset.identifier;
+            if (!doiIdentifier) {
+                await dataset.load('identifier');
+                doiIdentifier = dataset.identifier;
+            }
+
+            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
+                return false;
+            }
+
+            const datasetModified =
+                dataset.server_date_modified instanceof DateTime
+                    ? dataset.server_date_modified
+                    : DateTime.fromJSDate(dataset.server_date_modified);
+
+            if (!datasetModified) {
+                return true;
+            }
+
+            if (datasetModified > DateTime.now()) {
+                return false;
+            }
+
+            const doiClient = new DoiClient();
+            const DOI_CHECK_TIMEOUT = 300; // ms
+
+            const doiLastModified = await Promise.race([
+                doiClient.getDoiLastModified(doiIdentifier.value),
+                this.createTimeoutPromise(DOI_CHECK_TIMEOUT),
+            ]).catch(() => null);
+
+            if (!doiLastModified) {
+                // If uncertain, better include dataset for update
+                return true;
+            }
+
+            const doiModified = DateTime.fromJSDate(doiLastModified);
+            if (datasetModified > doiModified) {
+                const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds);
+                const toleranceSeconds = 600;
+                return diffInSeconds > toleranceSeconds;
+            }
+            return false;
+        } catch (error) {
+            return true; // safer: include dataset if unsure
+        }
+    }
+
+    /**
+     * Create a timeout promise for API calls
+     */
+    private createTimeoutPromise(timeoutMs: number): Promise<never> {
+        return new Promise((_, reject) => {
+            setTimeout(() => reject(new Error(`API call timeout after ${timeoutMs}ms`)), timeoutMs);
+        });
+    }
+
+    private showSimpleOutput(updatableDatasets: Dataset[]): void {
+        if (updatableDatasets.length === 0) {
+            console.log('No datasets need DataCite updates.');
+            return;
+        }
+
+        console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);
+
+        updatableDatasets.forEach((dataset) => {
+            console.log(`publish_id  ${dataset.publish_id} needs update - ${dataset.mainTitle || 'Untitled'}`);
+        });
+
+        console.log(`\nTo update these datasets, run:`);
+        console.log(`  node ace update:datacite`);
+        console.log(`\nOr update specific datasets:`);
+        console.log(`  node ace update:datacite -p <publish_id>`);
+    }
+
+    private async showVerboseOutput(updatableDatasets: Dataset[]): Promise<void> {
+        if (updatableDatasets.length === 0) {
+            console.log('No datasets need DataCite updates.');
+            return;
+        }
+
+        console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);
+
+        for (const dataset of updatableDatasets) {
+            await this.showDatasetDetails(dataset);
+        }
+
+        console.log(`\nSummary: ${updatableDatasets.length} datasets need updates`);
+    }
+
+    private async showDatasetDetails(dataset: Dataset): Promise<void> {
+        try {
+            let doiIdentifier = dataset.identifier;
+
+            if (!doiIdentifier) {
+                await dataset.load('identifier');
+                doiIdentifier = dataset.identifier;
+            }
+
+            const doiValue = doiIdentifier?.value || 'N/A';
+            const datasetModified = dataset.server_date_modified;
+
+            // Get DOI info from DataCite
+            const doiClient = new DoiClient();
+            const doiLastModified = await doiClient.getDoiLastModified(doiValue);
+            const doiState = await doiClient.getDoiState(doiValue);
+
+            console.log(`┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`);
+            console.log(`│ Title:               ${dataset.mainTitle || 'Untitled'}`);
+            console.log(`│ DOI:                 ${doiValue}`);
+            console.log(`│ DOI State:           ${doiState || 'Unknown'}`);
+            console.log(`│ Dataset Modified:    ${datasetModified ? datasetModified.toISO() : 'N/A'}`);
+            console.log(`│ DOI Modified:        ${doiLastModified ? DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}`);
+            console.log(`│ Status:              NEEDS UPDATE`);
+            console.log(`└─────────────────────────────────────────────────────────────────────────────────────────────\n`);
+        } catch (error) {
+            console.log(`┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`);
+            console.log(`│ Title:               ${dataset.mainTitle || 'Untitled'}`);
+            console.log(`│ DOI:                 ${dataset.identifier?.value || 'N/A'}`);
+            console.log(`│ Error:               ${error.message}`);
+            console.log(`│ Status:              NEEDS UPDATE (Error checking)`);
+            console.log(`└─────────────────────────────────────────────────────────────────────────────────────────────\n`);
+        }
+    }
+}
--- a/commands/update_datacite.ts
+++ b/commands/update_datacite.ts
@ -122,58 +122,53 @@ export default class UpdateDatacite extends BaseCommand {

    private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
        try {
-            // Check if dataset has a DOI identifier (HasOne relationship)
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
-                // Try to load the relationship if not already loaded
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
-                logger.warn(`Dataset ${dataset.publish_id}: No DOI identifier found`);
                return false;
            }

-            // Validate dataset modification date
            const datasetModified = dataset.server_date_modified;
            const now = DateTime.now();

            if (!datasetModified) {
-                logger.error(`Dataset ${dataset.publish_id}: server_date_modified is null or undefined`);
-                return true; // Update anyway if modification date is missing
+                return true; // Update if modification date is missing
            }

            if (datasetModified > now) {
-                logger.error(
-                    `Dataset ${dataset.publish_id}: server_date_modified (${datasetModified.toISO()}) is in the future! ` +
-                        `Current time: ${now.toISO()}. This indicates a data integrity issue. Skipping update.`,
-                );
-                return false; // Do not update when modification date is invalid
+                return false; // Skip invalid future dates
            }

-            // Get DOI information from DataCite using DoiClient
+            // Check DataCite DOI modification date
            const doiClient = new DoiClient();
            const doiLastModified = await doiClient.getDoiLastModified(doiIdentifier.value);

            if (!doiLastModified) {
-                logger.warn(`Dataset ${dataset.publish_id}: Could not retrieve DOI modification date from DataCite`);
-                return true; // Update anyway if we can't get DOI info
+                return false; // Do not update if we can't get DOI info
            }

-            // Compare dataset modification date with DOI modification date
            const doiModified = DateTime.fromJSDate(doiLastModified);
+            if (datasetModified > doiModified) {
+                // The dataset was modified after the DOI record was last modified.
+                // Calculate the difference in seconds
+                const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds);

-            logger.debug(
-                `Dataset ${dataset.publish_id}: Dataset modified: ${datasetModified.toISO()}, DOI modified: ${doiModified.toISO()}`,
-            );
+                // Define tolerance threshold (60 seconds = 1 minute)
+                const toleranceSeconds = 60;

-            // Update if dataset was modified after the DOI record
-            return datasetModified > doiModified;
+                // Only update if the difference is greater than the tolerance
+                // This prevents unnecessary updates for minor timestamp differences
+                return diffInSeconds > toleranceSeconds;
+            } else {
+                return false; // No update needed
+            }
        } catch (error) {
-            logger.warn(`Error checking update status for dataset ${dataset.publish_id}: ${error.message}`);
-            return true; // Update anyway if we can't determine status
+            return false; // Do not update if we can't determine the status or another error occurs
        }
    }