feat: Enhance ClamAV Docker entrypoint and configuration

- Updated docker-entrypoint.sh to improve ClamAV service initialization and logging.
- Added checks for ClamAV and freshclam daemon status.
- Optimized freshclam configuration for container usage, including logging to stdout and setting database directory.
- Introduced caching mechanism for enabled file extensions in vinejs_provider.ts to reduce database queries.
- Implemented a new command to list datasets needing DataCite DOI updates, with options for verbose output, count only, and IDs only.
- Updated package dependencies to include p-limit and pino-pretty.
- Finalized the ace command 'detect:missing-cross-references'.
This commit is contained in:
Kaimbacher 2025-09-26 12:19:35 +02:00
commit 6757bdb77c
10 changed files with 745 additions and 430 deletions

View file

@ -122,58 +122,53 @@ export default class UpdateDatacite extends BaseCommand {
/**
 * Decides whether the DataCite DOI record of a dataset should be updated.
 *
 * Decision order:
 *  1. No identifier of type 'doi' on the dataset        -> false
 *  2. `server_date_modified` is missing                  -> true
 *  3. `server_date_modified` lies in the future          -> false
 *  4. DoiClient yields no DOI modification date          -> true
 *  5. Otherwise: true only if the dataset was modified after the DOI record
 *
 * Any error thrown along the way is logged as a warning and results in `true`.
 *
 * @param dataset - Dataset to inspect; its `identifier` relation is loaded on demand.
 * @returns `true` when the DOI record should be updated, `false` otherwise.
 */
private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
    try {
        // The DOI is stored on the HasOne `identifier` relation; load it if it is not present yet.
        let doiIdentifier = dataset.identifier;
        if (!doiIdentifier) {
            await dataset.load('identifier');
            doiIdentifier = dataset.identifier;
        }

        if (!doiIdentifier || doiIdentifier.type !== 'doi') {
            logger.warn(`Dataset ${dataset.publish_id}: No DOI identifier found`);
            return false;
        }

        // Validate the dataset modification date before comparing it with anything.
        const datasetModified = dataset.server_date_modified;
        const now = DateTime.now();

        if (!datasetModified) {
            logger.error(`Dataset ${dataset.publish_id}: server_date_modified is null or undefined`);
            return true; // Update if modification date is missing
        }

        if (datasetModified > now) {
            logger.error(
                `Dataset ${dataset.publish_id}: server_date_modified (${datasetModified.toISO()}) is in the future! ` +
                    `Current time: ${now.toISO()}. This indicates a data integrity issue. Skipping update.`,
            );
            return false; // Skip invalid future dates
        }

        // Ask DataCite (via DoiClient) when the DOI record was last modified.
        const doiClient = new DoiClient();
        const doiLastModified = await doiClient.getDoiLastModified(doiIdentifier.value);

        if (!doiLastModified) {
            logger.warn(`Dataset ${dataset.publish_id}: Could not retrieve DOI modification date from DataCite`);
            // NOTE(review): the original text had an unreachable `return false`
            // ("not Update if we can't get DOI info") right after this return.
            // Fail-open is what actually executed and is preserved here; confirm the intended policy.
            return true;
        }

        const doiModified = DateTime.fromJSDate(doiLastModified);

        // NOTE(review): the original also computed a 60-second tolerance on the timestamp
        // difference, but an earlier `return` made that check unreachable. It is removed here
        // because it never took effect; re-introduce it deliberately if it is wanted.
        if (datasetModified > doiModified) {
            logger.debug(
                `Dataset ${dataset.publish_id}: Dataset modified: ${datasetModified.toISO()}, DOI modified: ${doiModified.toISO()}`,
            );
            return true; // Dataset was modified after the DOI record
        }

        return false; // No update needed
    } catch (error) {
        // The thrown value is not guaranteed to be an Error; narrow before reading `.message`.
        const reason = error instanceof Error ? error.message : String(error);
        logger.warn(`Error checking update status for dataset ${dataset.publish_id}: ${reason}`);
        // NOTE(review): the original text had an unreachable `return false`
        // ("not update if we can't determine status or other error") after this return.
        // Fail-open is what actually executed and is preserved here; confirm the intended policy.
        return true;
    }
}