feat: Enhance ClamAV Docker entrypoint and configuration
- Updated docker-entrypoint.sh to improve ClamAV service initialization and logging. - Added checks for ClamAV and freshclam daemon status. - Optimized freshclam configuration for container usage, including logging to stdout and setting database directory. - Introduced caching mechanism for enabled file extensions in vinejs_provider.ts to reduce database queries. - Implemented a new command to list datasets needing DataCite DOI updates, with options for verbose output, count only, and IDs only. - Updated package dependencies to include p-limit and pino-pretty. - finalized ace command 'detect:missing-cross-references'
This commit is contained in:
parent
4c8cce27da
commit
6757bdb77c
10 changed files with 745 additions and 430 deletions
|
|
@ -6,6 +6,7 @@
|
|||
*/
|
||||
import { BaseCommand, flags } from '@adonisjs/core/ace';
|
||||
import type { CommandOptions } from '@adonisjs/core/types/ace';
|
||||
import { DateTime } from 'luxon';
|
||||
import Dataset from '#models/dataset';
|
||||
import DatasetReference from '#models/dataset_reference';
|
||||
// import env from '#start/env';
|
||||
|
|
@ -15,6 +16,8 @@ interface MissingCrossReference {
|
|||
targetDatasetId: number;
|
||||
sourcePublishId: number | null;
|
||||
targetPublishId: number | null;
|
||||
sourceDoi: string | null;
|
||||
targetDoi: string | null;
|
||||
referenceType: string;
|
||||
relation: string;
|
||||
doi: string | null;
|
||||
|
|
@ -33,30 +36,58 @@ export default class DetectMissingCrossReferences extends BaseCommand {
|
|||
@flags.boolean({ alias: 'v', description: 'Verbose output' })
|
||||
public verbose: boolean = false;
|
||||
|
||||
@flags.number({ alias: 'p', description: 'Filter by specific publish_id (source or target dataset)' })
|
||||
public publish_id?: number;
|
||||
|
||||
// example: node ace detect:missing-cross-references --verbose -p 227 //if you want to filter by specific publish_id with details
|
||||
// example: node ace detect:missing-cross-references --verbose
|
||||
// example: node ace detect:missing-cross-references --fix -p 227 //if you want to filter by specific publish_id and fix it
|
||||
// example: node ace detect:missing-cross-references
|
||||
|
||||
public static options: CommandOptions = {
|
||||
startApp: true,
|
||||
staysAlive: false,
|
||||
};
|
||||
|
||||
// Define the allowed relations that we want to process
|
||||
private readonly ALLOWED_RELATIONS = ['IsNewVersionOf', 'IsPreviousVersionOf', 'IsVariantFormOf', 'IsOriginalFormOf'];
|
||||
|
||||
async run() {
|
||||
this.logger.info('🔍 Detecting missing cross-references...');
|
||||
this.logger.info(`📋 Processing only these relations: ${this.ALLOWED_RELATIONS.join(', ')}`);
|
||||
|
||||
if (this.publish_id) {
|
||||
this.logger.info(`Filtering by publish_id: ${this.publish_id}`);
|
||||
}
|
||||
|
||||
try {
|
||||
const missingReferences = await this.findMissingCrossReferences();
|
||||
|
||||
if (missingReferences.length === 0) {
|
||||
this.logger.success('All cross-references are properly linked!');
|
||||
const filterMsg = this.publish_id ? ` for publish_id ${this.publish_id}` : '';
|
||||
this.logger.success(`All cross-references are properly linked for the specified relations${filterMsg}!`);
|
||||
return;
|
||||
}
|
||||
|
||||
this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s):`);
|
||||
const filterMsg = this.publish_id ? ` (filtered by publish_id ${this.publish_id})` : '';
|
||||
this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s)${filterMsg}:`);
|
||||
|
||||
for (const missing of missingReferences) {
|
||||
this.logger.info(
|
||||
`Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`,
|
||||
);
|
||||
// Show brief list if not verbose mode
|
||||
if (!this.verbose) {
|
||||
for (const missing of missingReferences) {
|
||||
const sourceDoi = missing.sourceDoi ? ` DOI: ${missing.sourceDoi}` : '';
|
||||
const targetDoi = missing.targetDoi ? ` DOI: ${missing.targetDoi}` : '';
|
||||
|
||||
if (this.verbose) {
|
||||
this.logger.info(
|
||||
`Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId}${sourceDoi}) ${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId}${targetDoi}) → missing reverse: ${missing.reverseRelation}`,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// Verbose mode - show detailed info
|
||||
for (const missing of missingReferences) {
|
||||
this.logger.info(
|
||||
`Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`,
|
||||
);
|
||||
this.logger.info(` - Reference type: ${missing.referenceType}`);
|
||||
this.logger.info(` - Relation: ${missing.relation}`);
|
||||
this.logger.info(` - DOI: ${missing.doi}`);
|
||||
|
|
@ -67,20 +98,28 @@ export default class DetectMissingCrossReferences extends BaseCommand {
|
|||
await this.fixMissingReferences(missingReferences);
|
||||
this.logger.success('All missing cross-references have been fixed!');
|
||||
} else {
|
||||
this.printMissingReferencesList(missingReferences);
|
||||
if (this.verbose) {
|
||||
this.printMissingReferencesList(missingReferences);
|
||||
}
|
||||
this.logger.info('💡 Run with --fix flag to automatically create missing cross-references');
|
||||
if (this.publish_id) {
|
||||
this.logger.info(`🎯 Currently filtering by publish_id: ${this.publish_id}`);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
this.logger.error('Error detecting missing cross-references:', error);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
private async findMissingCrossReferences(): Promise<MissingCrossReference[]> {
|
||||
const missingReferences: {
|
||||
sourceDatasetId: number;
|
||||
targetDatasetId: number;
|
||||
sourcePublishId: number | null;
|
||||
targetPublishId: number | null;
|
||||
sourceDoi: string | null;
|
||||
targetDoi: string | null;
|
||||
referenceType: string;
|
||||
relation: string;
|
||||
doi: string | null;
|
||||
|
|
@ -90,22 +129,32 @@ export default class DetectMissingCrossReferences extends BaseCommand {
|
|||
this.logger.info('📊 Querying dataset references...');
|
||||
|
||||
// Find all references that point to Tethys datasets (DOI or URL containing tethys DOI)
|
||||
// Only from datasets that are published
|
||||
const tethysReferences = await DatasetReference.query()
|
||||
// Only from datasets that are published AND only for allowed relations
|
||||
const tethysReferencesQuery = DatasetReference.query()
|
||||
.whereIn('type', ['DOI', 'URL'])
|
||||
.whereIn('relation', this.ALLOWED_RELATIONS) // Only process allowed relations
|
||||
.where((query) => {
|
||||
query.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
|
||||
})
|
||||
.preload('dataset', (datasetQuery) => {
|
||||
datasetQuery.where('server_state', 'published');
|
||||
datasetQuery.preload('identifier');
|
||||
})
|
||||
.whereHas('dataset', (datasetQuery) => {
|
||||
datasetQuery.where('server_state', 'published');
|
||||
});
|
||||
if (typeof this.publish_id === 'number') {
|
||||
tethysReferencesQuery.whereHas('dataset', (datasetQuery) => {
|
||||
datasetQuery.where('publish_id', this.publish_id as number);
|
||||
});
|
||||
}
|
||||
|
||||
this.logger.info(`🔗 Found ${tethysReferences.length} Tethys references from published datasets`);
|
||||
const tethysReferences = await tethysReferencesQuery.exec();
|
||||
|
||||
this.logger.info(`🔗 Found ${tethysReferences.length} Tethys references from published datasets (allowed relations only)`);
|
||||
|
||||
let processedCount = 0;
|
||||
let skippedCount = 0;
|
||||
|
||||
for (const reference of tethysReferences) {
|
||||
processedCount++;
|
||||
|
||||
|
|
@ -113,6 +162,15 @@ export default class DetectMissingCrossReferences extends BaseCommand {
|
|||
this.logger.info(`📈 Processed ${processedCount}/${tethysReferences.length} references...`);
|
||||
}
|
||||
|
||||
// Double-check that this relation is in our allowed list (safety check)
|
||||
if (!this.ALLOWED_RELATIONS.includes(reference.relation)) {
|
||||
skippedCount++;
|
||||
if (this.verbose) {
|
||||
this.logger.info(`⏭️ Skipping relation "${reference.relation}" - not in allowed list`);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Extract dataset publish_id from DOI or URL
|
||||
const targetDatasetPublish = this.extractDatasetPublishIdFromReference(reference.value);
|
||||
|
||||
|
|
@ -127,6 +185,7 @@ export default class DetectMissingCrossReferences extends BaseCommand {
|
|||
const targetDataset = await Dataset.query()
|
||||
.where('publish_id', targetDatasetPublish)
|
||||
.where('server_state', 'published')
|
||||
.preload('identifier')
|
||||
.first();
|
||||
|
||||
if (!targetDataset) {
|
||||
|
|
@ -145,25 +204,31 @@ export default class DetectMissingCrossReferences extends BaseCommand {
|
|||
// Check if reverse reference exists
|
||||
const reverseReferenceExists = await this.checkReverseReferenceExists(
|
||||
targetDataset.id,
|
||||
reference.document_id,
|
||||
// reference.document_id,
|
||||
reference.relation,
|
||||
);
|
||||
|
||||
if (!reverseReferenceExists) {
|
||||
missingReferences.push({
|
||||
sourceDatasetId: reference.document_id,
|
||||
targetDatasetId: targetDataset.id,
|
||||
sourcePublishId: reference.dataset.publish_id || null,
|
||||
targetPublishId: targetDataset.publish_id || null,
|
||||
referenceType: reference.type,
|
||||
relation: reference.relation,
|
||||
doi: reference.value,
|
||||
reverseRelation: this.getReverseRelation(reference.relation),
|
||||
});
|
||||
const reverseRelation = this.getReverseRelation(reference.relation);
|
||||
if (reverseRelation) {
|
||||
// Only add if we have a valid reverse relation
|
||||
missingReferences.push({
|
||||
sourceDatasetId: reference.document_id,
|
||||
targetDatasetId: targetDataset.id,
|
||||
sourcePublishId: reference.dataset.publish_id || null,
|
||||
targetPublishId: targetDataset.publish_id || null,
|
||||
referenceType: reference.type,
|
||||
relation: reference.relation,
|
||||
doi: reference.value,
|
||||
reverseRelation: reverseRelation,
|
||||
sourceDoi: reference.dataset.identifier ? reference.dataset.identifier.value : null,
|
||||
targetDoi: targetDataset.identifier ? targetDataset.identifier.value : null,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.info(`✅ Processed all ${processedCount} references`);
|
||||
this.logger.info(`✅ Processed ${processedCount} references (${skippedCount} skipped due to relation filtering)`);
|
||||
return missingReferences;
|
||||
}
|
||||
|
||||
|
|
@ -183,64 +248,47 @@ export default class DetectMissingCrossReferences extends BaseCommand {
|
|||
return null;
|
||||
}
|
||||
|
||||
private async checkReverseReferenceExists(
|
||||
sourceDatasetId: number,
|
||||
targetDatasetId: number,
|
||||
originalRelation: string,
|
||||
): Promise<boolean> {
|
||||
private async checkReverseReferenceExists(targetDatasetId: number, originalRelation: string): Promise<boolean> {
|
||||
const reverseRelation = this.getReverseRelation(originalRelation);
|
||||
|
||||
if (!reverseRelation) {
|
||||
return true; // If no reverse relation is defined, consider it as "exists" to skip processing
|
||||
}
|
||||
|
||||
// Only check for reverse references where the source dataset is also published
|
||||
const reverseReference = await DatasetReference.query()
|
||||
.where('document_id', sourceDatasetId)
|
||||
// We don't filter by source document_id here to find any incoming reference from any published dataset
|
||||
// .where('document_id', sourceDatasetId)
|
||||
.where('related_document_id', targetDatasetId)
|
||||
.where('relation', reverseRelation)
|
||||
.whereHas('dataset', (datasetQuery) => {
|
||||
datasetQuery.where('server_state', 'published');
|
||||
})
|
||||
.first();
|
||||
|
||||
return !!reverseReference;
|
||||
}
|
||||
|
||||
private getReverseRelation(relation: string): string {
|
||||
private getReverseRelation(relation: string): string | null {
|
||||
const relationMap: Record<string, string> = {
|
||||
IsNewVersionOf: 'IsPreviousVersionOf',
|
||||
IsPreviousVersionOf: 'IsNewVersionOf',
|
||||
|
||||
IsVersionOf: 'HasVersion',
|
||||
HasVersion: 'IsVersionOf',
|
||||
|
||||
Compiles: 'IsCompiledBy',
|
||||
IsCompiledBy: 'Compiles',
|
||||
|
||||
IsVariantFormOf: 'IsOriginalFormOf',
|
||||
IsOriginalFormOf: 'IsVariantFormOf',
|
||||
|
||||
IsPartOf: 'HasPart',
|
||||
HasPart: 'IsPartOf',
|
||||
|
||||
IsSupplementTo: 'IsSupplementedBy',
|
||||
IsSupplementedBy: 'IsSupplementTo',
|
||||
|
||||
Continues: 'IsContinuedBy',
|
||||
IsContinuedBy: 'Continues',
|
||||
};
|
||||
|
||||
// to catch relation types like 'compiles' or 'IsVariantFormOf' that are not in the map mark reverse as 'HasVersion'
|
||||
return relationMap[relation] || 'HasVersion'; // Default fallback
|
||||
// Only return reverse relation if it exists in our map, otherwise return null
|
||||
return relationMap[relation] || null;
|
||||
}
|
||||
|
||||
private printMissingReferencesList(missingReferences: MissingCrossReference[]) {
|
||||
console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
|
||||
console.log('│ MISSING CROSS-REFERENCES REPORT │');
|
||||
console.log('│ (Published Datasets Only) │');
|
||||
console.log('│ (Published Datasets Only - Filtered Relations) │');
|
||||
console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
|
||||
console.log();
|
||||
|
||||
missingReferences.forEach((missing, index) => {
|
||||
console.log(
|
||||
`${index + 1}. Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId}) → Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId})`,
|
||||
`${index + 1}. Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId} Identifier: ${missing.sourceDoi})
|
||||
${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId} Identifier: ${missing.targetDoi})`,
|
||||
);
|
||||
console.log(` ├─ Current relation: "${missing.relation}"`);
|
||||
console.log(` ├─ Missing reverse relation: "${missing.reverseRelation}"`);
|
||||
|
|
@ -251,6 +299,7 @@ export default class DetectMissingCrossReferences extends BaseCommand {
|
|||
|
||||
console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
|
||||
console.log(`│ SUMMARY: ${missingReferences.length} missing reverse reference(s) detected │`);
|
||||
console.log(`│ Processed relations: ${this.ALLOWED_RELATIONS.join(', ')} │`);
|
||||
console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
|
||||
}
|
||||
|
||||
|
|
@ -262,27 +311,37 @@ export default class DetectMissingCrossReferences extends BaseCommand {
|
|||
|
||||
for (const [index, missing] of missingReferences.entries()) {
|
||||
try {
|
||||
// Get the source dataset to create proper reference - ensure it's published
|
||||
// Get both source and target datasets
|
||||
const sourceDataset = await Dataset.query()
|
||||
.where('id', missing.sourceDatasetId)
|
||||
.where('server_state', 'published')
|
||||
.preload('identifier')
|
||||
.first();
|
||||
|
||||
const targetDataset = await Dataset.query().where('id', missing.targetDatasetId).where('server_state', 'published').first();
|
||||
|
||||
if (!sourceDataset) {
|
||||
this.logger.warning(`⚠️ Source dataset ${missing.sourceDatasetId} not found or not published, skipping...`);
|
||||
errorCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create the reverse reference
|
||||
if (!targetDataset) {
|
||||
this.logger.warning(`⚠️ Target dataset ${missing.targetDatasetId} not found or not published, skipping...`);
|
||||
errorCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create the reverse reference using the referenced_by relationship
|
||||
// Example: If Dataset 297 IsNewVersionOf Dataset 144
|
||||
// We create an incoming reference for Dataset 144 that shows Dataset 297 IsPreviousVersionOf it
|
||||
const reverseReference = new DatasetReference();
|
||||
reverseReference.document_id = missing.targetDatasetId;
|
||||
reverseReference.related_document_id = missing.sourceDatasetId;
|
||||
// Don't set document_id - this creates an incoming reference via related_document_id
|
||||
reverseReference.related_document_id = missing.targetDatasetId; // 144 (dataset receiving the incoming reference)
|
||||
reverseReference.type = 'DOI';
|
||||
reverseReference.relation = missing.reverseRelation;
|
||||
|
||||
// Use the source dataset's DOI for the value
|
||||
// Use the source dataset's DOI for the value (what's being referenced)
|
||||
if (sourceDataset.identifier?.value) {
|
||||
reverseReference.value = `https://doi.org/${sourceDataset.identifier.value}`;
|
||||
} else {
|
||||
|
|
@ -293,12 +352,16 @@ export default class DetectMissingCrossReferences extends BaseCommand {
|
|||
// Use the source dataset's main title for the label
|
||||
reverseReference.label = sourceDataset.mainTitle || `Dataset ${missing.sourceDatasetId}`;
|
||||
|
||||
// Also save 'server_date_modified' on target dataset to trigger any downstream updates (e.g. search index)
|
||||
targetDataset.server_date_modified = DateTime.now();
|
||||
await targetDataset.save();
|
||||
|
||||
await reverseReference.save();
|
||||
fixedCount++;
|
||||
|
||||
if (this.verbose) {
|
||||
this.logger.info(
|
||||
`✅ [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.targetDatasetId} -> ${missing.sourceDatasetId}`,
|
||||
`✅ [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.sourceDatasetId} -> ${missing.targetDatasetId} (${missing.reverseRelation})`,
|
||||
);
|
||||
} else if ((index + 1) % 10 === 0) {
|
||||
this.logger.info(`📈 Fixed ${fixedCount}/${missingReferences.length} references...`);
|
||||
|
|
|
|||
346
commands/list_updatable_datacite.ts
Normal file
346
commands/list_updatable_datacite.ts
Normal file
|
|
@ -0,0 +1,346 @@
|
|||
/*
|
||||
|--------------------------------------------------------------------------
|
||||
| node ace make:command list-updateable-datacite
|
||||
| DONE: create commands/list_updatable_datacite.ts
|
||||
|--------------------------------------------------------------------------
|
||||
*/
|
||||
import { BaseCommand, flags } from '@adonisjs/core/ace';
|
||||
import { CommandOptions } from '@adonisjs/core/types/ace';
|
||||
import Dataset from '#models/dataset';
|
||||
import { DoiClient } from '#app/Library/Doi/DoiClient';
|
||||
import env from '#start/env';
|
||||
import logger from '@adonisjs/core/services/logger';
|
||||
import { DateTime } from 'luxon';
|
||||
import pLimit from 'p-limit';
|
||||
|
||||
/** Chunk size used when --chunk-size is missing or not a positive integer. */
const DEFAULT_CHUNK_SIZE = 50;

/** Maximum number of datasets of one chunk that are checked against DataCite at the same time. */
const MAX_CONCURRENT_DOI_CHECKS = 5;

/** Time budget (ms) for a single DataCite "last modified" lookup before the result is treated as unknown. */
const DOI_CHECK_TIMEOUT_MS = 300;

/**
 * A dataset counts as updatable only when it was modified more than this many
 * seconds after its DOI record.
 * NOTE(review): the `update:datacite` command appears to use a different tolerance —
 * confirm both commands are meant to agree.
 */
const DOI_TOLERANCE_SECONDS = 600;

/**
 * Ace command that lists published datasets whose local modification date is
 * newer than the modification date of their DataCite DOI record.
 *
 * Output modes: default (one line per dataset), --verbose (detail box per
 * dataset), --count-only (just the number) and --ids-only (just publish IDs).
 */
export default class ListUpdateableDatacite extends BaseCommand {
    static commandName = 'list:updateable-datacite';
    static description = 'List all datasets that need DataCite DOI updates';

    public static needsApplication = true;

    @flags.boolean({ alias: 'v', description: 'Verbose output showing detailed information' })
    public verbose: boolean = false;

    @flags.boolean({ alias: 'c', description: 'Show only count of updatable datasets' })
    public countOnly: boolean = false;

    @flags.boolean({ alias: 'i', description: 'Show only publish IDs (useful for scripting)' })
    public idsOnly: boolean = false;

    @flags.number({ description: 'Chunk size for processing datasets (default: 50)' })
    public chunkSize: number = 50;

    //example: node ace list:updateable-datacite
    //example: node ace list:updateable-datacite --verbose
    //example: node ace list:updateable-datacite --count-only
    //example: node ace list:updateable-datacite --ids-only
    //example: node ace list:updateable-datacite --chunk-size 50

    public static options: CommandOptions = {
        startApp: true,
        // The option key is `staysAlive`; the previous `stayAlive` spelling is not a CommandOptions key.
        staysAlive: false,
    };

    /**
     * Entry point: pages through all published datasets with a DOI, collects the
     * ones that need a DataCite update and prints them in the selected output mode.
     * Sets a non-zero exit code when the configuration or the flag combination is invalid.
     */
    async run() {
        const prefix = env.get('DATACITE_PREFIX', '');
        const base_domain = env.get('BASE_DOMAIN', '');

        if (!prefix || !base_domain) {
            logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
            this.exitCode = 1;
            return;
        }

        // Prevent conflicting flags
        if (this.verbose && (this.countOnly || this.idsOnly)) {
            logger.error('Flags --verbose cannot be combined with --count-only or --ids-only');
            this.exitCode = 1;
            return;
        }

        // Zero, negative, fractional or NaN chunk sizes fall back to the default page size.
        const chunkSize = Number.isInteger(this.chunkSize) && this.chunkSize > 0 ? this.chunkSize : DEFAULT_CHUNK_SIZE;

        // In the script-friendly modes only the result itself is printed.
        const quiet = this.countOnly || this.idsOnly;

        const updatableDatasets: Dataset[] = [];
        let totalProcessed = 0;

        if (!quiet) {
            logger.info(`Processing datasets in chunks of ${chunkSize}...`);
        }

        for (let page = 1; ; page += 1) {
            const datasets = await this.getDatasets(page, chunkSize);
            if (datasets.length === 0) {
                break;
            }

            if (!quiet) {
                logger.info(`Processing chunk ${page} (${datasets.length} datasets)...`);
            }

            const chunkUpdatableDatasets = await this.processChunk(datasets);
            updatableDatasets.push(...chunkUpdatableDatasets);
            totalProcessed += datasets.length;

            // A short page means this was the last one; skip the extra empty query.
            if (datasets.length < chunkSize) {
                break;
            }
        }

        if (!quiet) {
            logger.info(`Processed ${totalProcessed} datasets total, found ${updatableDatasets.length} that need updates`);
        }

        if (this.countOnly) {
            console.log(updatableDatasets.length);
        } else if (this.idsOnly) {
            updatableDatasets.forEach((dataset) => console.log(dataset.publish_id));
        } else if (this.verbose) {
            await this.showVerboseOutput(updatableDatasets);
        } else {
            this.showSimpleOutput(updatableDatasets);
        }
    }

    /**
     * Checks every dataset of a chunk and returns the ones that need a DataCite update.
     *
     * Checks run with limited concurrency to avoid flooding the DataCite API. A dataset
     * whose check throws is kept in the result (fail-safe: better to list a dataset that
     * may not need an update than to miss one that does).
     *
     * @param datasets - Datasets of the current chunk
     * @returns The subset of `datasets` that needs an update
     */
    private async processChunk(datasets: Dataset[]): Promise<Dataset[]> {
        const limit = pLimit(MAX_CONCURRENT_DOI_CHECKS);

        const tasks = datasets.map((dataset) =>
            limit(async () => {
                try {
                    const needsUpdate = await this.shouldUpdateDataset(dataset);
                    return needsUpdate ? dataset : null;
                } catch (error) {
                    // Stay silent in count-only / ids-only mode so scripted output is not polluted.
                    if (!this.countOnly && !this.idsOnly) {
                        logger.warn(
                            `Error checking dataset ${dataset.publish_id}: ${
                                error instanceof Error ? error.message : JSON.stringify(error)
                            }`,
                        );
                    }
                    // Fail-safe: include dataset if uncertain
                    return dataset;
                }
            }),
        );

        // allSettled keeps one unexpectedly rejected task from discarding the whole chunk.
        const results = await Promise.allSettled(tasks);

        return results
            .filter((result): result is PromiseFulfilledResult<Dataset> => result.status === 'fulfilled' && result.value !== null)
            .map((result) => result.value);
    }

    /**
     * Loads one page of published datasets that have a DOI identifier, ordered by
     * publish_id, with the identifier, xmlCache and titles relations preloaded.
     *
     * @param page - 1-based page number
     * @param chunkSize - Number of datasets per page
     */
    private async getDatasets(page: number, chunkSize: number): Promise<Dataset[]> {
        return await Dataset.query()
            .orderBy('publish_id', 'asc')
            .preload('identifier')
            .preload('xmlCache')
            .preload('titles')
            .where('server_state', 'published')
            .whereHas('identifier', (identifierQuery) => {
                identifierQuery.where('type', 'doi');
            })
            .forPage(page, chunkSize); // Get the datasets for the current page
    }

    /**
     * Decides whether a dataset's DataCite record is outdated.
     *
     * Returns `false` when the dataset has no DOI identifier or its modification date
     * lies in the future. Returns `true` when the modification date is missing or
     * invalid, when the DataCite lookup fails or times out, or when any other error
     * occurs (uncertain cases are included on purpose). Otherwise returns `true` only
     * if the dataset was modified more than the tolerance after the DOI record.
     *
     * @param dataset - Dataset to check
     */
    private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
        try {
            let doiIdentifier = dataset.identifier;
            if (!doiIdentifier) {
                // Relation was not preloaded; try to load it lazily.
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
                return false;
            }

            const rawModified = dataset.server_date_modified;
            if (!rawModified) {
                return true;
            }

            const datasetModified = rawModified instanceof DateTime ? rawModified : DateTime.fromJSDate(rawModified);

            // An invalid DateTime is still a truthy object and compares as "not greater" against
            // everything, so it must be detected explicitly instead of with a falsy check.
            if (!datasetModified.isValid) {
                return true;
            }

            if (datasetModified > DateTime.now()) {
                return false;
            }

            const doiClient = new DoiClient();
            const doiLastModified = await this.withTimeout(
                doiClient.getDoiLastModified(doiIdentifier.value),
                DOI_CHECK_TIMEOUT_MS,
            ).catch(() => null);

            if (!doiLastModified) {
                // If uncertain, better include dataset for update
                return true;
            }

            const doiModified = DateTime.fromJSDate(doiLastModified);
            if (datasetModified > doiModified) {
                const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds);
                return diffInSeconds > DOI_TOLERANCE_SECONDS;
            }
            return false;
        } catch (error) {
            return true; // safer: include dataset if unsure
        }
    }

    /**
     * Resolves or rejects with `task`, or rejects with a timeout error once
     * `timeoutMs` has elapsed, whichever happens first. The timer is always cleared
     * so no pending timeout is left behind after the task settles.
     *
     * @param task - Promise to guard
     * @param timeoutMs - Time budget in milliseconds
     */
    private async withTimeout<T>(task: Promise<T>, timeoutMs: number): Promise<T> {
        let timer: ReturnType<typeof setTimeout> | undefined;
        const timeout = new Promise<never>((_, reject) => {
            timer = setTimeout(() => reject(new Error(`API call timeout after ${timeoutMs}ms`)), timeoutMs);
        });

        try {
            return await Promise.race([task, timeout]);
        } finally {
            clearTimeout(timer);
        }
    }

    /**
     * Prints one line per updatable dataset followed by a hint on how to run the update.
     */
    private showSimpleOutput(updatableDatasets: Dataset[]): void {
        if (updatableDatasets.length === 0) {
            console.log('No datasets need DataCite updates.');
            return;
        }

        console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);

        updatableDatasets.forEach((dataset) => {
            console.log(`publish_id ${dataset.publish_id} needs update - ${dataset.mainTitle || 'Untitled'}`);
        });

        console.log(`\nTo update these datasets, run:`);
        console.log(`  node ace update:datacite`);
        console.log(`\nOr update specific datasets:`);
        console.log(`  node ace update:datacite -p <publish_id>`);
    }

    /**
     * Prints a detail box for every updatable dataset, one after the other so the
     * boxes are not interleaved, followed by a summary line.
     */
    private async showVerboseOutput(updatableDatasets: Dataset[]): Promise<void> {
        if (updatableDatasets.length === 0) {
            console.log('No datasets need DataCite updates.');
            return;
        }

        console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);

        for (const dataset of updatableDatasets) {
            await this.showDatasetDetails(dataset);
        }

        console.log(`\nSummary: ${updatableDatasets.length} datasets need updates`);
    }

    /**
     * Prints title, DOI, DOI state and both modification dates of one dataset.
     * If loading the identifier or querying DataCite throws, a reduced box with the
     * error message is printed instead.
     */
    private async showDatasetDetails(dataset: Dataset): Promise<void> {
        const header = `┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`;
        const footer = `└─────────────────────────────────────────────────────────────────────────────────────────────\n`;

        try {
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            const doiValue = doiIdentifier?.value || 'N/A';
            const datasetModified = dataset.server_date_modified;

            // Get DOI info from DataCite. Both lookups are independent, so run them together;
            // skip them entirely when there is no DOI instead of querying the 'N/A' placeholder.
            const doiClient = new DoiClient();
            const [doiLastModified, doiState] = doiIdentifier?.value
                ? await Promise.all([doiClient.getDoiLastModified(doiValue), doiClient.getDoiState(doiValue)])
                : [null, null];

            console.log(header);
            console.log(`│ Title: ${dataset.mainTitle || 'Untitled'}`);
            console.log(`│ DOI: ${doiValue}`);
            console.log(`│ DOI State: ${doiState || 'Unknown'}`);
            console.log(`│ Dataset Modified: ${datasetModified ? datasetModified.toISO() : 'N/A'}`);
            console.log(`│ DOI Modified: ${doiLastModified ? DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}`);
            console.log(`│ Status: NEEDS UPDATE`);
            console.log(footer);
        } catch (error) {
            // The catch variable is `unknown` under strict settings; narrow before reading `.message`.
            const message = error instanceof Error ? error.message : JSON.stringify(error);

            console.log(header);
            console.log(`│ Title: ${dataset.mainTitle || 'Untitled'}`);
            console.log(`│ DOI: ${dataset.identifier?.value || 'N/A'}`);
            console.log(`│ Error: ${message}`);
            console.log(`│ Status: NEEDS UPDATE (Error checking)`);
            console.log(footer);
        }
    }
}
|
||||
|
|
@ -122,58 +122,53 @@ export default class UpdateDatacite extends BaseCommand {
|
|||
|
||||
private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
|
||||
try {
|
||||
// Check if dataset has a DOI identifier (HasOne relationship)
|
||||
let doiIdentifier = dataset.identifier;
|
||||
|
||||
if (!doiIdentifier) {
|
||||
// Try to load the relationship if not already loaded
|
||||
await dataset.load('identifier');
|
||||
doiIdentifier = dataset.identifier;
|
||||
}
|
||||
|
||||
if (!doiIdentifier || doiIdentifier.type !== 'doi') {
|
||||
logger.warn(`Dataset ${dataset.publish_id}: No DOI identifier found`);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Validate dataset modification date
|
||||
const datasetModified = dataset.server_date_modified;
|
||||
const now = DateTime.now();
|
||||
|
||||
if (!datasetModified) {
|
||||
logger.error(`Dataset ${dataset.publish_id}: server_date_modified is null or undefined`);
|
||||
return true; // Update anyway if modification date is missing
|
||||
return true; // Update if modification date is missing
|
||||
}
|
||||
|
||||
if (datasetModified > now) {
|
||||
logger.error(
|
||||
`Dataset ${dataset.publish_id}: server_date_modified (${datasetModified.toISO()}) is in the future! ` +
|
||||
`Current time: ${now.toISO()}. This indicates a data integrity issue. Skipping update.`,
|
||||
);
|
||||
return false; // Do not update when modification date is invalid
|
||||
return false; // Skip invalid future dates
|
||||
}
|
||||
|
||||
// Get DOI information from DataCite using DoiClient
|
||||
// Check DataCite DOI modification date
|
||||
const doiClient = new DoiClient();
|
||||
const doiLastModified = await doiClient.getDoiLastModified(doiIdentifier.value);
|
||||
|
||||
if (!doiLastModified) {
|
||||
logger.warn(`Dataset ${dataset.publish_id}: Could not retrieve DOI modification date from DataCite`);
|
||||
return true; // Update anyway if we can't get DOI info
|
||||
return false; // not Update if we can't get DOI info
|
||||
}
|
||||
|
||||
// Compare dataset modification date with DOI modification date
|
||||
const doiModified = DateTime.fromJSDate(doiLastModified);
|
||||
if (datasetModified > doiModified) {
|
||||
// if dataset was modified after DOI creation
|
||||
// Calculate the difference in seconds
|
||||
const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds);
|
||||
|
||||
logger.debug(
|
||||
`Dataset ${dataset.publish_id}: Dataset modified: ${datasetModified.toISO()}, DOI modified: ${doiModified.toISO()}`,
|
||||
);
|
||||
// Define tolerance threshold (60 seconds = 1 minute)
|
||||
const toleranceSeconds = 60;
|
||||
|
||||
// Update if dataset was modified after the DOI record
|
||||
return datasetModified > doiModified;
|
||||
// Only update if the difference is greater than the tolerance
|
||||
// This prevents unnecessary updates for minor timestamp differences
|
||||
return diffInSeconds > toleranceSeconds;
|
||||
} else {
|
||||
return false; // No update needed
|
||||
}
|
||||
} catch (error) {
|
||||
logger.warn(`Error checking update status for dataset ${dataset.publish_id}: ${error.message}`);
|
||||
return true; // Update anyway if we can't determine status
|
||||
return false; // not update if we can't determine status or other error
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue