diff --git a/Dockerfile b/Dockerfile index a5d1263..0d2f959 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,55 +1,61 @@ ################## First Stage - Creating base ######################### # Created a variable to hold our node base image -ARG NODE_IMAGE=node:22-bookworm-slim +ARG NODE_IMAGE=node:22-trixie-slim FROM $NODE_IMAGE AS base + # Install dumb-init and ClamAV, and perform ClamAV database update -RUN apt update \ - && apt-get install -y dumb-init clamav clamav-daemon nano \ +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + dumb-init \ + clamav \ + clamav-daemon \ + ca-certificates \ && rm -rf /var/lib/apt/lists/* \ # Creating folders and changing ownerships - && mkdir -p /home/node/app && chown node:node /home/node/app \ + && mkdir -p /home/node/app \ && mkdir -p /var/lib/clamav \ && mkdir /usr/local/share/clamav \ - && chown -R node:clamav /var/lib/clamav /usr/local/share/clamav /etc/clamav \ - # permissions && mkdir /var/run/clamav \ - && chown node:clamav /var/run/clamav \ - && chmod 750 /var/run/clamav -# ----------------------------------------------- -# --- ClamAV & FeshClam ------------------------- -# ----------------------------------------------- -# RUN \ -# chmod 644 /etc/clamav/freshclam.conf && \ -# freshclam && \ -# mkdir /var/run/clamav && \ - # chown -R clamav:root /var/run/clamav + && mkdir -p /var/log/clamav \ + && mkdir -p /tmp/clamav-logs \ + + # Set ownership and permissions + && chown node:node /home/node/app \ + # && chown -R node:clamav /var/lib/clamav /usr/local/share/clamav /etc/clamav /var/run/clamav \ + && chown -R clamav:clamav /var/lib/clamav /usr/local/share/clamav /etc/clamav /var/run/clamav /var/log/clamav \ + && chmod 755 /tmp/clamav-logs \ + && chmod 750 /var/run/clamav \ + && chmod 755 /var/lib/clamav \ + && chmod 755 /var/log/clamav \ + # Add node user to clamav group and allow sudo for clamav commands + && usermod -a -G clamav node \ + && chmod g+w /var/run/clamav /var/lib/clamav /var/log/clamav /tmp/clamav-logs -# # initial update of av databases -# RUN freshclam -# Configure Clam AV... 
-COPY --chown=node:clamav ./*.conf /etc/clamav/ +# Configure ClamAV - copy config files before switching user +# COPY --chown=node:clamav ./*.conf /etc/clamav/ +COPY --chown=clamav:clamav ./*.conf /etc/clamav/ + +# Copy entrypoint script +COPY --chown=node:node docker-entrypoint.sh /home/node/app/docker-entrypoint.sh +RUN chmod +x /home/node/app/docker-entrypoint.sh + +ENV TZ="Europe/Vienna" -# # permissions -# RUN mkdir /var/run/clamav && \ -# chown node:clamav /var/run/clamav && \ -# chmod 750 /var/run/clamav # Setting the working directory WORKDIR /home/node/app # Changing the current active user to "node" + +# Download initial ClamAV database as root before switching users +USER root +RUN freshclam --quiet || echo "Initial database download failed - will retry at runtime" + USER node -# initial update of av databases -RUN freshclam - -# VOLUME /var/lib/clamav -COPY --chown=node:clamav docker-entrypoint.sh /home/node/app/docker-entrypoint.sh -RUN chmod +x /home/node/app/docker-entrypoint.sh -ENV TZ="Europe/Vienna" - - +# Initial update of AV databases (moved after USER directive) +# RUN freshclam || true ################## Second Stage - Installing dependencies ########## @@ -70,14 +76,13 @@ ENV NODE_ENV=production # We run "node ace build" to build the app (dist folder) for production RUN node ace build --ignore-ts-errors # RUN node ace build --production -# RUN node ace build --ignore-ts-errors ################## Final Stage - Production ######################### # In this final stage, we will start running the application FROM base AS production # Here, we include all the required environment variables -# ENV NODE_ENV=production +ENV NODE_ENV=production # ENV PORT=$PORT # ENV HOST=0.0.0.0 @@ -91,4 +96,4 @@ COPY --chown=node:node --from=build /home/node/app/build . 
EXPOSE 3333 ENTRYPOINT ["/home/node/app/docker-entrypoint.sh"] # Run the command to start the server using "dumb-init" -CMD [ "dumb-init", "node", "bin/server.js" ] \ No newline at end of file +CMD [ "node", "bin/server.js" ] \ No newline at end of file diff --git a/adonisrc.ts b/adonisrc.ts index de94c63..e42693a 100644 --- a/adonisrc.ts +++ b/adonisrc.ts @@ -30,9 +30,9 @@ export default defineConfig({ () => import('#start/rules/unique'), () => import('#start/rules/translated_language'), () => import('#start/rules/unique_person'), - () => import('#start/rules/file_length'), - () => import('#start/rules/file_scan'), - () => import('#start/rules/allowed_extensions_mimetypes'), + // () => import('#start/rules/file_length'), + // () => import('#start/rules/file_scan'), + // () => import('#start/rules/allowed_extensions_mimetypes'), () => import('#start/rules/dependent_array_min_length'), () => import('#start/rules/referenceValidation'), () => import('#start/rules/valid_mimetype'), diff --git a/commands/fix_dataset_cross_references.ts b/commands/fix_dataset_cross_references.ts index 2662e25..248fefd 100644 --- a/commands/fix_dataset_cross_references.ts +++ b/commands/fix_dataset_cross_references.ts @@ -6,6 +6,7 @@ */ import { BaseCommand, flags } from '@adonisjs/core/ace'; import type { CommandOptions } from '@adonisjs/core/types/ace'; +import { DateTime } from 'luxon'; import Dataset from '#models/dataset'; import DatasetReference from '#models/dataset_reference'; // import env from '#start/env'; @@ -15,6 +16,8 @@ interface MissingCrossReference { targetDatasetId: number; sourcePublishId: number | null; targetPublishId: number | null; + sourceDoi: string | null; + targetDoi: string | null; referenceType: string; relation: string; doi: string | null; @@ -33,30 +36,58 @@ export default class DetectMissingCrossReferences extends BaseCommand { @flags.boolean({ alias: 'v', description: 'Verbose output' }) public verbose: boolean = false; + @flags.number({ alias: 'p', description: 'Filter by specific publish_id (source or target dataset)' }) + public publish_id?: number; + + // example: node ace detect:missing-cross-references --verbose -p 227 //if you want to filter by specific publish_id with details + // example: node ace detect:missing-cross-references --verbose + // example: node ace detect:missing-cross-references --fix -p 227 //if you want to filter by specific publish_id and fix it + // example: node ace detect:missing-cross-references + public static options: CommandOptions = { startApp: true, staysAlive: false, }; + // Define the allowed relations that we want to process + private readonly ALLOWED_RELATIONS = ['IsNewVersionOf', 'IsPreviousVersionOf', 'IsVariantFormOf', 'IsOriginalFormOf']; + async run() { this.logger.info('πŸ” Detecting missing cross-references...'); + this.logger.info(`πŸ“‹ Processing only these relations: ${this.ALLOWED_RELATIONS.join(', ')}`); + + if (this.publish_id) { + this.logger.info(`Filtering by publish_id: ${this.publish_id}`); + } try { const missingReferences = await this.findMissingCrossReferences(); if (missingReferences.length === 0) { - this.logger.success('All cross-references are properly linked!'); + const filterMsg = this.publish_id ? ` for publish_id ${this.publish_id}` : ''; + this.logger.success(`All cross-references are properly linked for the specified relations${filterMsg}!`); return; } - this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s):`); + const filterMsg = this.publish_id ? 
` (filtered by publish_id ${this.publish_id})` : ''; + this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s)${filterMsg}:`); - for (const missing of missingReferences) { - this.logger.info( - `Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`, - ); + // Show brief list if not verbose mode + if (!this.verbose) { + for (const missing of missingReferences) { + const sourceDoi = missing.sourceDoi ? ` DOI: ${missing.sourceDoi}` : ''; + const targetDoi = missing.targetDoi ? ` DOI: ${missing.targetDoi}` : ''; - if (this.verbose) { + this.logger.info( + `Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId}${sourceDoi}) ${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId}${targetDoi}) β†’ missing reverse: ${missing.reverseRelation}`, + ); + } + } else { + // Verbose mode - show detailed info + for (const missing of missingReferences) { + this.logger.info( + `Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`, + ); this.logger.info(` - Reference type: ${missing.referenceType}`); this.logger.info(` - Relation: ${missing.relation}`); this.logger.info(` - DOI: ${missing.doi}`); @@ -67,20 +98,28 @@ export default class DetectMissingCrossReferences extends BaseCommand { await this.fixMissingReferences(missingReferences); this.logger.success('All missing cross-references have been fixed!'); } else { - this.printMissingReferencesList(missingReferences); + if (this.verbose) { + this.printMissingReferencesList(missingReferences); + } this.logger.info('πŸ’‘ Run with --fix flag to automatically create missing cross-references'); + if (this.publish_id) { + this.logger.info(`🎯 Currently filtering by publish_id: ${this.publish_id}`); + } } } catch (error) { this.logger.error('Error detecting missing cross-references:', error); process.exit(1); } } + private async findMissingCrossReferences(): Promise<MissingCrossReference[]> { const missingReferences: { sourceDatasetId: number; targetDatasetId: number; sourcePublishId: number | null; targetPublishId: number | null; + sourceDoi: string | null; + targetDoi: string | null; referenceType: string; relation: string; doi: string | null; @@ -90,22 +129,32 @@ export default class DetectMissingCrossReferences extends BaseCommand { this.logger.info('πŸ“Š Querying dataset references...'); // Find all references that point to Tethys datasets (DOI or URL containing tethys DOI) - // Only from datasets that are published - const tethysReferences = await DatasetReference.query() + // Only from datasets that are published AND only for allowed relations + const tethysReferencesQuery = DatasetReference.query() .whereIn('type', ['DOI', 'URL']) + .whereIn('relation', this.ALLOWED_RELATIONS) // Only process allowed relations .where((query) => { query.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%'); }) .preload('dataset', (datasetQuery) => { - datasetQuery.where('server_state', 'published'); + datasetQuery.preload('identifier'); }) .whereHas('dataset', (datasetQuery) => { datasetQuery.where('server_state', 'published'); }); + if (typeof this.publish_id === 'number') { + tethysReferencesQuery.whereHas('dataset', (datasetQuery) => { + datasetQuery.where('publish_id', this.publish_id as number); + }); + } - this.logger.info(`πŸ”— Found ${tethysReferences.length} Tethys references from published datasets`); + const tethysReferences = await 
tethysReferencesQuery.exec(); + + this.logger.info(`πŸ”— Found ${tethysReferences.length} Tethys references from published datasets (allowed relations only)`); let processedCount = 0; + let skippedCount = 0; + for (const reference of tethysReferences) { processedCount++; @@ -113,6 +162,15 @@ export default class DetectMissingCrossReferences extends BaseCommand { this.logger.info(`πŸ“ˆ Processed ${processedCount}/${tethysReferences.length} references...`); } + // Double-check that this relation is in our allowed list (safety check) + if (!this.ALLOWED_RELATIONS.includes(reference.relation)) { + skippedCount++; + if (this.verbose) { + this.logger.info(`⏭️ Skipping relation "${reference.relation}" - not in allowed list`); + } + continue; + } + // Extract dataset publish_id from DOI or URL const targetDatasetPublish = this.extractDatasetPublishIdFromReference(reference.value); @@ -127,6 +185,7 @@ export default class DetectMissingCrossReferences extends BaseCommand { const targetDataset = await Dataset.query() .where('publish_id', targetDatasetPublish) .where('server_state', 'published') + .preload('identifier') .first(); if (!targetDataset) { @@ -145,25 +204,31 @@ export default class DetectMissingCrossReferences extends BaseCommand { // Check if reverse reference exists const reverseReferenceExists = await this.checkReverseReferenceExists( targetDataset.id, - reference.document_id, + // reference.document_id, reference.relation, ); if (!reverseReferenceExists) { - missingReferences.push({ - sourceDatasetId: reference.document_id, - targetDatasetId: targetDataset.id, - sourcePublishId: reference.dataset.publish_id || null, - targetPublishId: targetDataset.publish_id || null, - referenceType: reference.type, - relation: reference.relation, - doi: reference.value, - reverseRelation: this.getReverseRelation(reference.relation), - }); + const reverseRelation = this.getReverseRelation(reference.relation); + if (reverseRelation) { + // Only add if we have a valid reverse relation + missingReferences.push({ + sourceDatasetId: reference.document_id, + targetDatasetId: targetDataset.id, + sourcePublishId: reference.dataset.publish_id || null, + targetPublishId: targetDataset.publish_id || null, + referenceType: reference.type, + relation: reference.relation, + doi: reference.value, + reverseRelation: reverseRelation, + sourceDoi: reference.dataset.identifier ? reference.dataset.identifier.value : null, + targetDoi: targetDataset.identifier ? 
targetDataset.identifier.value : null, + }); + } } } - this.logger.info(`βœ… Processed all ${processedCount} references`); + this.logger.info(`βœ… Processed ${processedCount} references (${skippedCount} skipped due to relation filtering)`); return missingReferences; } @@ -183,64 +248,47 @@ export default class DetectMissingCrossReferences extends BaseCommand { return null; } - private async checkReverseReferenceExists( - sourceDatasetId: number, - targetDatasetId: number, - originalRelation: string, - ): Promise<boolean> { + private async checkReverseReferenceExists(targetDatasetId: number, originalRelation: string): Promise<boolean> { const reverseRelation = this.getReverseRelation(originalRelation); + if (!reverseRelation) { + return true; // If no reverse relation is defined, consider it as "exists" to skip processing + } + // Only check for reverse references where the source dataset is also published const reverseReference = await DatasetReference.query() - .where('document_id', sourceDatasetId) + // We don't filter by source document_id here to find any incoming reference from any published dataset + // .where('document_id', sourceDatasetId) .where('related_document_id', targetDatasetId) .where('relation', reverseRelation) - .whereHas('dataset', (datasetQuery) => { - datasetQuery.where('server_state', 'published'); - }) .first(); return !!reverseReference; } - private getReverseRelation(relation: string): string { + private getReverseRelation(relation: string): string | null { const relationMap: Record<string, string> = { IsNewVersionOf: 'IsPreviousVersionOf', IsPreviousVersionOf: 'IsNewVersionOf', - - IsVersionOf: 'HasVersion', - HasVersion: 'IsVersionOf', - - Compiles: 'IsCompiledBy', - IsCompiledBy: 'Compiles', - IsVariantFormOf: 'IsOriginalFormOf', IsOriginalFormOf: 'IsVariantFormOf', - - IsPartOf: 'HasPart', - HasPart: 'IsPartOf', - - IsSupplementTo: 'IsSupplementedBy', - IsSupplementedBy: 'IsSupplementTo', - - Continues: 'IsContinuedBy', - IsContinuedBy: 'Continues', }; - // to catch relation types like 'compiles' or 'IsVariantFormOf' that are not in the map mark reverse as 'HasVersion' - return relationMap[relation] || 'HasVersion'; // Default fallback + // Only return reverse relation if it exists in our map, otherwise return null + return relationMap[relation] || null; } private printMissingReferencesList(missingReferences: MissingCrossReference[]) { console.log('β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”'); console.log('β”‚ MISSING CROSS-REFERENCES REPORT β”‚'); - console.log('β”‚ (Published Datasets Only) β”‚'); + console.log('β”‚ (Published Datasets Only - Filtered Relations) β”‚'); console.log('β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜'); console.log(); missingReferences.forEach((missing, index) => { console.log( `${index + 1}. 
Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId} Identifier: ${missing.sourceDoi}) + ${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId} Identifier: ${missing.targetDoi})`, ); console.log(` β”œβ”€ Current relation: "${missing.relation}"`); console.log(` β”œβ”€ Missing reverse relation: "${missing.reverseRelation}"`); @@ -251,6 +299,7 @@ export default class DetectMissingCrossReferences extends BaseCommand { console.log('β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”'); console.log(`β”‚ SUMMARY: ${missingReferences.length} missing reverse reference(s) detected β”‚`); + console.log(`β”‚ Processed relations: ${this.ALLOWED_RELATIONS.join(', ')} β”‚`); console.log('β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜'); } @@ -262,27 +311,37 @@ export default class DetectMissingCrossReferences extends BaseCommand { for (const [index, missing] of missingReferences.entries()) { try { - // Get the source dataset to create proper reference - ensure it's published + // Get both source and target datasets const sourceDataset = await Dataset.query() .where('id', missing.sourceDatasetId) .where('server_state', 'published') .preload('identifier') .first(); + const targetDataset = await Dataset.query().where('id', missing.targetDatasetId).where('server_state', 'published').first(); + if (!sourceDataset) { this.logger.warning(`⚠️ Source dataset ${missing.sourceDatasetId} not found or not published, skipping...`); errorCount++; continue; } - // Create the reverse reference + if (!targetDataset) { + this.logger.warning(`⚠️ Target dataset ${missing.targetDatasetId} not found or not published, skipping...`); + errorCount++; + continue; + } + + // Create the reverse reference using the referenced_by relationship + // Example: If Dataset 297 IsNewVersionOf Dataset 144 + // We create an incoming reference for Dataset 144 that shows Dataset 297 IsPreviousVersionOf it const reverseReference = new DatasetReference(); - reverseReference.document_id = missing.targetDatasetId; - reverseReference.related_document_id = missing.sourceDatasetId; + // Don't set document_id - this creates an incoming reference via related_document_id + reverseReference.related_document_id = missing.targetDatasetId; // 144 (dataset receiving the incoming reference) reverseReference.type = 'DOI'; reverseReference.relation = missing.reverseRelation; - // Use the source dataset's DOI for the value + // Use the source dataset's DOI for the value (what's being referenced) if (sourceDataset.identifier?.value) { reverseReference.value = `https://doi.org/${sourceDataset.identifier.value}`; } else { @@ -293,12 +352,16 @@ export default class DetectMissingCrossReferences extends BaseCommand { // Use the source dataset's main title for the label reverseReference.label = sourceDataset.mainTitle || `Dataset ${missing.sourceDatasetId}`; + // Also save 'server_date_modified' on target dataset to trigger any downstream updates (e.g. 
search index) + targetDataset.server_date_modified = DateTime.now(); + await targetDataset.save(); + await reverseReference.save(); fixedCount++; if (this.verbose) { this.logger.info( - `βœ… [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.targetDatasetId} -> ${missing.sourceDatasetId}`, + `βœ… [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.sourceDatasetId} -> ${missing.targetDatasetId} (${missing.reverseRelation})`, ); } else if ((index + 1) % 10 === 0) { this.logger.info(`πŸ“ˆ Fixed ${fixedCount}/${missingReferences.length} references...`); diff --git a/commands/list_updatable_datacite.ts b/commands/list_updatable_datacite.ts new file mode 100644 index 0000000..aed411a --- /dev/null +++ b/commands/list_updatable_datacite.ts @@ -0,0 +1,346 @@ +/* +|-------------------------------------------------------------------------- +| node ace make:command list-updateable-datacite +| DONE: create commands/list_updatable_datacite.ts +|-------------------------------------------------------------------------- +*/ +import { BaseCommand, flags } from '@adonisjs/core/ace'; +import { CommandOptions } from '@adonisjs/core/types/ace'; +import Dataset from '#models/dataset'; +import { DoiClient } from '#app/Library/Doi/DoiClient'; +import env from '#start/env'; +import logger from '@adonisjs/core/services/logger'; +import { DateTime } from 'luxon'; +import pLimit from 'p-limit'; + +export default class ListUpdateableDatacite extends BaseCommand { + static commandName = 'list:updateable-datacite'; + static description = 'List all datasets that need DataCite DOI updates'; + + public static needsApplication = true; + + // private chunkSize = 100; // Set chunk size for pagination + + @flags.boolean({ alias: 'v', description: 'Verbose output showing detailed information' }) + public verbose: boolean = false; + + @flags.boolean({ alias: 'c', description: 'Show only count of updatable datasets' }) + public countOnly: boolean = false; + + @flags.boolean({ alias: 'i', description: 'Show only publish IDs (useful for scripting)' }) + public idsOnly: boolean = false; + + @flags.number({ description: 'Chunk size for processing datasets (default: 50)' }) + public chunkSize: number = 50; + + //example: node ace list:updateable-datacite + //example: node ace list:updateable-datacite --verbose + //example: node ace list:updateable-datacite --count-only + //example: node ace list:updateable-datacite --ids-only + //example: node ace list:updateable-datacite --chunk-size 50 + + public static options: CommandOptions = { + startApp: true, + staysAlive: false, + }; + + async run() { + const prefix = env.get('DATACITE_PREFIX', ''); + const base_domain = env.get('BASE_DOMAIN', ''); + + if (!prefix || !base_domain) { + logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables'); + return; + } + + // Prevent conflicting flags + if ((this.verbose && this.countOnly) || (this.verbose && this.idsOnly)) { + logger.error('Flags --verbose cannot be combined with --count-only or --ids-only'); + return; + } + + const chunkSize = this.chunkSize || 50; + let page = 1; + let hasMoreDatasets = true; + let totalProcessed = 0; + const updatableDatasets: Dataset[] = []; + + if (!this.countOnly && !this.idsOnly) { + logger.info(`Processing datasets in chunks of ${chunkSize}...`); + } + + while (hasMoreDatasets) { + const datasets = await this.getDatasets(page, chunkSize); + + if (datasets.length === 0) { + hasMoreDatasets = false; + break; + } + + if 
(!this.countOnly && !this.idsOnly) { + logger.info(`Processing chunk ${page} (${datasets.length} datasets)...`); + } + + const chunkUpdatableDatasets = await this.processChunk(datasets); + updatableDatasets.push(...chunkUpdatableDatasets); + totalProcessed += datasets.length; + + page += 1; + if (datasets.length < chunkSize) { + hasMoreDatasets = false; + } + } + + if (!this.countOnly && !this.idsOnly) { + logger.info(`Processed ${totalProcessed} datasets total, found ${updatableDatasets.length} that need updates`); + } + + if (this.countOnly) { + console.log(updatableDatasets.length); + } else if (this.idsOnly) { + updatableDatasets.forEach((dataset) => console.log(dataset.publish_id)); + } else if (this.verbose) { + await this.showVerboseOutput(updatableDatasets); + } else { + this.showSimpleOutput(updatableDatasets); + } + } + + /** + * Processes a chunk of datasets to determine which ones need DataCite updates + * + * This method handles parallel processing of datasets within a chunk, providing + * efficient error handling and filtering of results. + * + * @param datasets - Array of Dataset objects to process + * @returns Promise - Array of datasets that need updates + */ + // private async processChunk(datasets: Dataset[]): Promise { + // // Process datasets in parallel using Promise.allSettled for better error handling + // // + // // Why Promise.allSettled vs Promise.all? + // // - Promise.all fails fast: if ANY promise rejects, the entire operation fails + // // - Promise.allSettled waits for ALL promises: some can fail, others succeed + // // - This is crucial for batch processing where we don't want one bad dataset + // // to stop processing of the entire chunk + // const results = await Promise.allSettled( + // datasets.map(async (dataset) => { + // try { + // // Check if this specific dataset needs a DataCite update + // const needsUpdate = await this.shouldUpdateDataset(dataset); + + // // Return the dataset if it needs update, null if it doesn't + // // This creates a sparse array that we'll filter later + // return needsUpdate ? dataset : null; + // } catch (error) { + // // Error handling for individual dataset checks + // // + // // Log warnings only if we're not in silent modes (count-only or ids-only) + // // This prevents log spam when running automated scripts + // if (!this.countOnly && !this.idsOnly) { + // logger.warn(`Error checking dataset ${dataset.publish_id}: ${error.message}`); + // } + + // // IMPORTANT DECISION: Return the dataset anyway if we can't determine status + // // + // // Why? It's safer to include a dataset that might not need updating + // // than to miss one that actually does need updating. This follows the + // // "fail-safe" principle - if we're unsure, err on the side of caution + // return dataset; + // } + // }), + // ); + + // // Filter and extract results from Promise.allSettled response + // // + // // Promise.allSettled returns an array of objects with this structure: + // // - { status: 'fulfilled', value: T } for successful promises + // // - { status: 'rejected', reason: Error } for failed promises + // // + // // We need to: + // // 1. Only get fulfilled results (rejected ones are already handled above) + // // 2. Filter out null values (datasets that don't need updates) + // // 3. 
Extract the actual Dataset objects from the wrapper + // return results + // .filter( + // (result): result is PromiseFulfilledResult<Dataset> => + // // Type guard: only include fulfilled results that have actual values + // // This filters out: + // // - Rejected promises (shouldn't happen due to try/catch, but safety first) + // // - Fulfilled promises that returned null (datasets that don't need updates) + // result.status === 'fulfilled' && result.value !== null, + // ) + // .map((result) => result.value!); // Extract the Dataset from the wrapper + // // The ! is safe here because we filtered out null values above + // } + + private async processChunk(datasets: Dataset[]): Promise<Dataset[]> { + // Limit concurrency to avoid API flooding (e.g., max 5 at once) + const limit = pLimit(5); + + const tasks = datasets.map((dataset) => + limit(async () => { + try { + const needsUpdate = await this.shouldUpdateDataset(dataset); + return needsUpdate ? dataset : null; + } catch (error) { + if (!this.countOnly && !this.idsOnly) { + logger.warn( + `Error checking dataset ${dataset.publish_id}: ${ + error instanceof Error ? error.message : JSON.stringify(error) + }`, + ); + } + // Fail-safe: include dataset if uncertain + return dataset; + } + }), + ); + + const results = await Promise.allSettled(tasks); + + return results + .filter((result): result is PromiseFulfilledResult<Dataset> => result.status === 'fulfilled' && result.value !== null) + .map((result) => result.value!); + } + + private async getDatasets(page: number, chunkSize: number): Promise<Dataset[]> { + return await Dataset.query() + .orderBy('publish_id', 'asc') + .preload('identifier') + .preload('xmlCache') + .preload('titles') + .where('server_state', 'published') + .whereHas('identifier', (identifierQuery) => { + identifierQuery.where('type', 'doi'); + }) + .forPage(page, chunkSize); // Get datasets for the current page + } + + private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> { + try { + let doiIdentifier = dataset.identifier; + if (!doiIdentifier) { + await dataset.load('identifier'); + doiIdentifier = dataset.identifier; + } + + if (!doiIdentifier || doiIdentifier.type !== 'doi') { + return false; + } + + const datasetModified = + dataset.server_date_modified instanceof DateTime + ? 
dataset.server_date_modified + : DateTime.fromJSDate(dataset.server_date_modified); + + if (!datasetModified) { + return true; + } + + if (datasetModified > DateTime.now()) { + return false; + } + + const doiClient = new DoiClient(); + const DOI_CHECK_TIMEOUT = 300; // ms + + const doiLastModified = await Promise.race([ + doiClient.getDoiLastModified(doiIdentifier.value), + this.createTimeoutPromise(DOI_CHECK_TIMEOUT), + ]).catch(() => null); + + if (!doiLastModified) { + // If uncertain, better include dataset for update + return true; + } + + const doiModified = DateTime.fromJSDate(doiLastModified); + if (datasetModified > doiModified) { + const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds); + const toleranceSeconds = 600; + return diffInSeconds > toleranceSeconds; + } + return false; + } catch (error) { + return true; // safer: include dataset if unsure + } + } + + /** + * Create a timeout promise for API calls + */ + private createTimeoutPromise(timeoutMs: number): Promise<never> { + return new Promise((_, reject) => { + setTimeout(() => reject(new Error(`API call timeout after ${timeoutMs}ms`)), timeoutMs); + }); + } + + private showSimpleOutput(updatableDatasets: Dataset[]): void { + if (updatableDatasets.length === 0) { + console.log('No datasets need DataCite updates.'); + return; + } + + console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`); + + updatableDatasets.forEach((dataset) => { + console.log(`publish_id ${dataset.publish_id} needs update - ${dataset.mainTitle || 'Untitled'}`); + }); + + console.log(`\nTo update these datasets, run:`); + console.log(` node ace update:datacite`); + console.log(`\nOr update specific datasets:`); + console.log(` node ace update:datacite -p <publish_id>`); + } + + private async showVerboseOutput(updatableDatasets: Dataset[]): Promise<void> { + if (updatableDatasets.length === 0) { + console.log('No datasets need DataCite updates.'); + return; + } + + console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`); + + for (const dataset of updatableDatasets) { + await this.showDatasetDetails(dataset); + } + + console.log(`\nSummary: ${updatableDatasets.length} datasets need updates`); + } + + private async showDatasetDetails(dataset: Dataset): Promise<void> { + try { + let doiIdentifier = dataset.identifier; + + if (!doiIdentifier) { + await dataset.load('identifier'); + doiIdentifier = dataset.identifier; + } + + const doiValue = doiIdentifier?.value || 'N/A'; + const datasetModified = dataset.server_date_modified; + + // Get DOI info from DataCite + const doiClient = new DoiClient(); + const doiLastModified = await doiClient.getDoiLastModified(doiValue); + const doiState = await doiClient.getDoiState(doiValue); + + console.log(`β”Œβ”€ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`); + console.log(`β”‚ Title: ${dataset.mainTitle || 'Untitled'}`); + console.log(`β”‚ DOI: ${doiValue}`); + console.log(`β”‚ DOI State: ${doiState || 'Unknown'}`); + console.log(`β”‚ Dataset Modified: ${datasetModified ? datasetModified.toISO() : 'N/A'}`); + console.log(`β”‚ DOI Modified: ${doiLastModified ? 
DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}`); + console.log(`β”‚ Status: NEEDS UPDATE`); + console.log(`└─────────────────────────────────────────────────────────────────────────────────────────────\n`); + } catch (error) { + console.log(`β”Œβ”€ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`); + console.log(`β”‚ Title: ${dataset.mainTitle || 'Untitled'}`); + console.log(`β”‚ DOI: ${dataset.identifier?.value || 'N/A'}`); + console.log(`β”‚ Error: ${error.message}`); + console.log(`β”‚ Status: NEEDS UPDATE (Error checking)`); + console.log(`└─────────────────────────────────────────────────────────────────────────────────────────────\n`); + } + } +} diff --git a/commands/update_datacite.ts b/commands/update_datacite.ts index 9280f95..7ccb0f0 100644 --- a/commands/update_datacite.ts +++ b/commands/update_datacite.ts @@ -122,58 +122,53 @@ export default class UpdateDatacite extends BaseCommand { private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> { try { - // Check if dataset has a DOI identifier (HasOne relationship) let doiIdentifier = dataset.identifier; if (!doiIdentifier) { - // Try to load the relationship if not already loaded await dataset.load('identifier'); doiIdentifier = dataset.identifier; } if (!doiIdentifier || doiIdentifier.type !== 'doi') { - logger.warn(`Dataset ${dataset.publish_id}: No DOI identifier found`); return false; } - // Validate dataset modification date const datasetModified = dataset.server_date_modified; const now = DateTime.now(); if (!datasetModified) { - logger.error(`Dataset ${dataset.publish_id}: server_date_modified is null or undefined`); - return true; // Update anyway if modification date is missing + return true; // Update if modification date is missing } if (datasetModified > now) { - logger.error( - `Dataset ${dataset.publish_id}: server_date_modified (${datasetModified.toISO()}) is in the future! ` + - `Current time: ${now.toISO()}. This indicates a data integrity issue. 
Skipping update.`, - ); - return false; // Do not update when modification date is invalid + return false; // Skip invalid future dates } - // Get DOI information from DataCite using DoiClient + // Check DataCite DOI modification date const doiClient = new DoiClient(); const doiLastModified = await doiClient.getDoiLastModified(doiIdentifier.value); if (!doiLastModified) { - logger.warn(`Dataset ${dataset.publish_id}: Could not retrieve DOI modification date from DataCite`); - return true; // Update anyway if we can't get DOI info + return false; // Don't update if we can't get DOI info } - // Compare dataset modification date with DOI modification date const doiModified = DateTime.fromJSDate(doiLastModified); + if (datasetModified > doiModified) { + // Dataset was modified after the DOI record + // Calculate the difference in seconds + const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds); - logger.debug( - `Dataset ${dataset.publish_id}: Dataset modified: ${datasetModified.toISO()}, DOI modified: ${doiModified.toISO()}`, - ); + // Define tolerance threshold (60 seconds = 1 minute) + const toleranceSeconds = 60; - // Update if dataset was modified after the DOI record - return datasetModified > doiModified; + // Only update if the difference is greater than the tolerance + // This prevents unnecessary updates for minor timestamp differences + return diffInSeconds > toleranceSeconds; + } else { + return false; // No update needed + } } catch (error) { - logger.warn(`Error checking update status for dataset ${dataset.publish_id}: ${error.message}`); - return true; // Update anyway if we can't determine status + return false; // Don't update if we can't determine the status } } diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 8ef61c7..f932a8d 100644 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -1,47 +1,61 @@ #!/bin/bash +set -e -# # Run freshclam to update virus definitions -# freshclam +echo "Starting ClamAV services..." -# # Sleep for a few seconds to give ClamAV time to start -# sleep 5 -# # Start the ClamAV daemon -# /etc/init.d/clamav-daemon start - -# bootstrap clam av service and clam av database updater -set -m - -function process_file() { - if [[ ! -z "$1" ]]; then - local SETTING_LIST=$(echo "$1" | tr ',' '\n' | grep "^[A-Za-z][A-Za-z]*=.*$") - local SETTING - - for SETTING in ${SETTING_LIST}; do - # Remove any existing copies of this setting. We do this here so that - # settings with multiple values (e.g. ExtraDatabase) can still be added - # multiple times below - local KEY=${SETTING%%=*} - sed -i $2 -e "/^${KEY} /d" - done - - for SETTING in ${SETTING_LIST}; do - # Split on first '=' - local KEY=${SETTING%%=*} - local VALUE=${SETTING#*=} - echo "${KEY} ${VALUE}" >> "$2" - done fi -} +# Try to download database if missing +if [ ! "$(ls -A /var/lib/clamav 2>/dev/null)" ]; then + echo "Downloading ClamAV database (this may take a while)..." + + # Simple freshclam run without complex config + if sg clamav -c "freshclam --datadir=/var/lib/clamav --quiet"; then + echo "βœ“ Database downloaded successfully" + else + echo "⚠ Database download failed - creating minimal setup" + # Create a dummy file so clamd doesn't immediately fail + sg clamav -c "touch /var/lib/clamav/.dummy" fi +fi -# process_file "${CLAMD_SETTINGS_CSV}" /etc/clamav/clamd.conf -# process_file "${FRESHCLAM_SETTINGS_CSV}" /etc/clamav/freshclam.conf +# Start freshclam daemon for automatic updates +echo "Starting freshclam daemon for automatic updates..." 
+sg clamav -c "freshclam -d" & -# start in background -freshclam -d & # /etc/init.d/clamav-freshclam start & -clamd +# Start clamd in background +# Start clamd in foreground (so dumb-init can supervise it) # /etc/init.d/clamav-daemon start & -# change back to CMD of dockerfile -exec "$@" \ No newline at end of file +# Start clamd daemon in background using sg +echo "Starting ClamAV daemon..." +# sg clamav -c "clamd" & +# Use sg to run clamd with proper group permissions +# sg clamav -c "clamd" & +sg clamav -c "clamd --config-file=/etc/clamav/clamd.conf" & + + +# Give services time to start +echo "Waiting for services to initialize..." +sleep 8 + +# simple check +if pgrep clamd > /dev/null; then + echo "βœ“ ClamAV daemon is running" +else + echo "⚠ ClamAV daemon status uncertain, but continuing..." +fi + +# Check if freshclam daemon is running +if pgrep freshclam > /dev/null; then + echo "βœ“ Freshclam daemon is running" +else + echo "⚠ Freshclam daemon status uncertain, but continuing..." +fi + +# # change back to CMD of dockerfile +# exec "$@" + +echo "βœ“ ClamAV setup complete" +echo "Starting main application..." +exec dumb-init -- "$@" \ No newline at end of file diff --git a/freshclam.conf b/freshclam.conf index ee82a23..444a63f 100644 --- a/freshclam.conf +++ b/freshclam.conf @@ -1,229 +1,47 @@ ## -## Example config file for freshclam -## Please read the freshclam.conf(5) manual before editing this file. +## Container-optimized freshclam configuration ## - -# Comment or remove the line below. - -# Path to the database directory. -# WARNING: It must match clamd.conf's directive! -# Default: hardcoded (depends on installation options) +# Database directory DatabaseDirectory /var/lib/clamav -# Path to the log file (make sure it has proper permissions) -# Default: disabled +# Log to stdout for container logging # UpdateLogFile /dev/stdout -# Maximum size of the log file. -# Value of 0 disables the limit. -# You may use 'M' or 'm' for megabytes (1M = 1m = 1048576 bytes) -# and 'K' or 'k' for kilobytes (1K = 1k = 1024 bytes). -# in bytes just don't use modifiers. If LogFileMaxSize is enabled, -# log rotation (the LogRotate option) will always be enabled. -# Default: 1M -#LogFileMaxSize 2M - -# Log time with each message. -# Default: no +# Basic logging settings LogTime yes - -# Enable verbose logging. -# Default: no -LogVerbose yes - -# Use system logger (can work together with UpdateLogFile). -# Default: no +LogVerbose no LogSyslog no -# Specify the type of syslog messages - please refer to 'man syslog' -# for facility names. -# Default: LOG_LOCAL6 -#LogFacility LOG_MAIL - -# Enable log rotation. Always enabled when LogFileMaxSize is enabled. -# Default: no -#LogRotate yes - -# This option allows you to save the process identifier of the daemon -# Default: disabled -#PidFile /var/run/freshclam.pid +# PID file location PidFile /var/run/clamav/freshclam.pid -# By default when started freshclam drops privileges and switches to the -# "clamav" user. This directive allows you to change the database owner. -# Default: clamav (may depend on installation options) -DatabaseOwner node +# Database owner +DatabaseOwner clamav -# Use DNS to verify virus database version. Freshclam uses DNS TXT records -# to verify database and software versions. With this directive you can change -# the database verification domain. -# WARNING: Do not touch it unless you're configuring freshclam to use your -# own database verification domain. 
-# Default: current.cvd.clamav.net -#DNSDatabaseInfo current.cvd.clamav.net - -# Uncomment the following line and replace XY with your country -# code. See http://www.iana.org/cctld/cctld-whois.htm for the full list. -# You can use db.XY.ipv6.clamav.net for IPv6 connections. +# Mirror settings for Austria DatabaseMirror db.at.clamav.net - -# database.clamav.net is a round-robin record which points to our most -# reliable mirrors. It's used as a fall back in case db.XY.clamav.net is -# not working. DO NOT TOUCH the following line unless you know what you -# are doing. DatabaseMirror database.clamav.net -# How many attempts to make before giving up. -# Default: 3 (per mirror) -#MaxAttempts 5 - # With this option you can control scripted updates. It's highly recommended # to keep it enabled. # Default: yes -#ScriptedUpdates yes - -# By default freshclam will keep the local databases (.cld) uncompressed to -# make their handling faster. With this option you can enable the compression; -# the change will take effect with the next database update. -# Default: no -#CompressLocalDatabase no - -# With this option you can provide custom sources (http:// or file://) for -# database files. This option can be used multiple times. -# Default: no custom URLs -#DatabaseCustomURL http://myserver.com/mysigs.ndb -#DatabaseCustomURL file:///mnt/nfs/local.hdb - -# This option allows you to easily point freshclam to private mirrors. -# If PrivateMirror is set, freshclam does not attempt to use DNS -# to determine whether its databases are out-of-date, instead it will -# use the If-Modified-Since request or directly check the headers of the -# remote database files. For each database, freshclam first attempts -# to download the CLD file. If that fails, it tries to download the -# CVD file. This option overrides DatabaseMirror, DNSDatabaseInfo -# and ScriptedUpdates. It can be used multiple times to provide -# fall-back mirrors. -# Default: disabled -#PrivateMirror mirror1.mynetwork.com -#PrivateMirror mirror2.mynetwork.com +# Update settings +ScriptedUpdates yes # Number of database checks per day. # Default: 12 (every two hours) -#Checks 24 +Checks 12 -# Proxy settings -# Default: disabled -#HTTPProxyServer myproxy.com -#HTTPProxyPort 1234 -#HTTPProxyUsername myusername -#HTTPProxyPassword mypass - -# If your servers are behind a firewall/proxy which applies User-Agent -# filtering you can use this option to force the use of a different -# User-Agent header. -# Default: clamav/version_number -#HTTPUserAgent SomeUserAgentIdString - -# Use aaa.bbb.ccc.ddd as client address for downloading databases. Useful for -# multi-homed systems. -# Default: Use OS'es default outgoing IP address. -#LocalIPAddress aaa.bbb.ccc.ddd - -# Send the RELOAD command to clamd. -# Default: no -#NotifyClamd /path/to/clamd.conf - -# Run command after successful database update. -# Default: disabled -#OnUpdateExecute command - -# Run command when database update process fails. -# Default: disabled -#OnErrorExecute command - -# Run command when freshclam reports outdated version. -# In the command string %v will be replaced by the new version number. -# Default: disabled -#OnOutdatedExecute command - -# Don't fork into background. -# Default: no +# Don't fork (good for containers) Foreground no -# Enable debug messages in libclamav. -# Default: no -#Debug yes +# Connection timeouts +ConnectTimeout 60 +ReceiveTimeout 60 -# Timeout in seconds when connecting to database server. 
-# Default: 30 -#ConnectTimeout 60 +# Test databases before using them +TestDatabases yes -# Timeout in seconds when reading from database server. -# Default: 30 -#ReceiveTimeout 60 - -# With this option enabled, freshclam will attempt to load new -# databases into memory to make sure they are properly handled -# by libclamav before replacing the old ones. -# Default: yes -#TestDatabases yes - -# When enabled freshclam will submit statistics to the ClamAV Project about -# the latest virus detections in your environment. The ClamAV maintainers -# will then use this data to determine what types of malware are the most -# detected in the field and in what geographic area they are. -# Freshclam will connect to clamd in order to get recent statistics. -# Default: no -#SubmitDetectionStats /path/to/clamd.conf - -# Country of origin of malware/detection statistics (for statistical -# purposes only). The statistics collector at ClamAV.net will look up -# your IP address to determine the geographical origin of the malware -# reported by your installation. If this installation is mainly used to -# scan data which comes from a different location, please enable this -# option and enter a two-letter code (see http://www.iana.org/domains/root/db/) -# of the country of origin. -# Default: disabled -#DetectionStatsCountry country-code - -# This option enables support for our "Personal Statistics" service. -# When this option is enabled, the information on malware detected by -# your clamd installation is made available to you through our website. -# To get your HostID, log on http://www.stats.clamav.net and add a new -# host to your host list. Once you have the HostID, uncomment this option -# and paste the HostID here. As soon as your freshclam starts submitting -# information to our stats collecting service, you will be able to view -# the statistics of this clamd installation by logging into -# http://www.stats.clamav.net with the same credentials you used to -# generate the HostID. For more information refer to: -# http://www.clamav.net/documentation.html#cctts -# This feature requires SubmitDetectionStats to be enabled. -# Default: disabled -#DetectionStatsHostID unique-id - -# This option enables support for Google Safe Browsing. When activated for -# the first time, freshclam will download a new database file (safebrowsing.cvd) -# which will be automatically loaded by clamd and clamscan during the next -# reload, provided that the heuristic phishing detection is turned on. This -# database includes information about websites that may be phishing sites or -# possible sources of malware. When using this option, it's mandatory to run -# freshclam at least every 30 minutes. -# Freshclam uses the ClamAV's mirror infrastructure to distribute the -# database and its updates but all the contents are provided under Google's -# terms of use. See http://www.google.com/transparencyreport/safebrowsing -# and http://www.clamav.net/documentation.html#safebrowsing -# for more information. -# Default: disabled -#SafeBrowsing yes - -# This option enables downloading of bytecode.cvd, which includes additional -# detection mechanisms and improvements to the ClamAV engine. -# Default: enabled -#Bytecode yes - -# Download an additional 3rd party signature database distributed through -# the ClamAV mirrors. -# This option can be used multiple times. 
-#ExtraDatabase dbname1 -#ExtraDatabase dbname2 +# Enable bytecode signatures +Bytecode yes \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index eb6179c..f9efdac 100644 --- a/package-lock.json +++ b/package-lock.json @@ -48,7 +48,9 @@ "node-2fa": "^2.0.3", "node-exceptions": "^4.0.1", "notiwind": "^2.0.0", + "p-limit": "^7.1.1", "pg": "^8.9.0", + "pino-pretty": "^13.0.0", "qrcode": "^1.5.3", "redis": "^5.0.0", "reflect-metadata": "^0.2.1", @@ -92,7 +94,6 @@ "hot-hook": "^0.4.0", "numeral": "^2.0.6", "pinia": "^3.0.2", - "pino-pretty": "^13.0.0", "postcss-loader": "^8.1.1", "prettier": "^3.4.2", "supertest": "^6.3.3", @@ -7398,7 +7399,6 @@ "version": "4.6.3", "resolved": "https://registry.npmjs.org/dateformat/-/dateformat-4.6.3.tgz", "integrity": "sha512-2P0p0pFGzHS5EMnhdxQi7aJN+iMheud0UhG4dlE1DLAlvL8JHjJJTX/CSm4JXwV0Ka5nGk3zC5mcb5bUQUxxMA==", - "dev": true, "license": "MIT", "engines": { "node": "*" @@ -7904,7 +7904,6 @@ "version": "1.4.5", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz", "integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==", - "dev": true, "license": "MIT", "dependencies": { "once": "^1.4.0" @@ -8560,7 +8559,6 @@ "version": "3.0.2", "resolved": "https://registry.npmjs.org/fast-copy/-/fast-copy-3.0.2.tgz", "integrity": "sha512-dl0O9Vhju8IrcLndv2eU4ldt1ftXMqqfgN4H1cpmGV7P6jeB9FwpN9a2c8DPGE1Ys88rNUJVYDHq73CGAGOPfQ==", - "dev": true, "license": "MIT" }, "node_modules/fast-deep-equal": { @@ -8633,7 +8631,6 @@ "version": "2.1.1", "resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.1.1.tgz", "integrity": "sha512-W+KJc2dmILlPplD/H4K9l9LcAHAfPtP6BY84uVLXQ6Evcz9Lcg33Y2z1IVblT6xdY54PXYVHEv+0Wpq8Io6zkA==", - "dev": true, "license": "MIT" }, "node_modules/fast-uri": { @@ -9667,7 +9664,6 @@ "version": "5.0.0", "resolved": "https://registry.npmjs.org/help-me/-/help-me-5.0.0.tgz", "integrity": "sha512-7xgomUX6ADmcYzFik0HzAxh/73YlKR9bmFzf51CZwR+b6YtzU2m0u49hQCqV6SvlqIqsaxovfwdvbnsw3b/zpg==", - "dev": true, "license": "MIT" }, "node_modules/hookable": { @@ -10432,7 +10428,6 @@ "version": "3.1.1", "resolved": "https://registry.npmjs.org/joycon/-/joycon-3.1.1.tgz", "integrity": "sha512-34wB/Y7MW7bzjKRjUKTa46I2Z7eV62Rkhva+KkopW7Qvv/OSWBqvkSY7vusOPrNuZcUG3tApvdVgNB8POj3SPw==", - "dev": true, "license": "MIT", "engines": { "node": ">=10" @@ -11159,7 +11154,6 @@ "version": "1.2.8", "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", - "dev": true, "license": "MIT", "funding": { "url": "https://github.com/sponsors/ljharb" @@ -11717,15 +11711,15 @@ } }, "node_modules/p-limit": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-4.0.0.tgz", - "integrity": "sha512-5b0R4txpzjPWVw/cXXUResoD4hb6U/x9BH08L7nw+GN1sezDzPdxeRvpc9c433fZhBan/wusjbCsqwqm4EIBIQ==", + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-7.1.1.tgz", + "integrity": "sha512-i8PyM2JnsNChVSYWLr2BAjNoLi0BAYC+wecOnZnVV+YSNJkzP7cWmvI34dk0WArWfH9KwBHNoZI3P3MppImlIA==", "license": "MIT", "dependencies": { - "yocto-queue": "^1.0.0" + "yocto-queue": "^1.2.1" }, "engines": { - "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + "node": ">=20" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -11746,6 +11740,21 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + 
"node_modules/p-locate/node_modules/p-limit": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-4.0.0.tgz", + "integrity": "sha512-5b0R4txpzjPWVw/cXXUResoD4hb6U/x9BH08L7nw+GN1sezDzPdxeRvpc9c433fZhBan/wusjbCsqwqm4EIBIQ==", + "license": "MIT", + "dependencies": { + "yocto-queue": "^1.0.0" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/p-map": { "version": "7.0.3", "resolved": "https://registry.npmjs.org/p-map/-/p-map-7.0.3.tgz", @@ -12165,7 +12174,6 @@ "version": "13.1.1", "resolved": "https://registry.npmjs.org/pino-pretty/-/pino-pretty-13.1.1.tgz", "integrity": "sha512-TNNEOg0eA0u+/WuqH0MH0Xui7uqVk9D74ESOpjtebSQYbNWJk/dIxCXIxFsNfeN53JmtWqYHP2OrIZjT/CBEnA==", - "dev": true, "license": "MIT", "dependencies": { "colorette": "^2.0.7", @@ -12190,7 +12198,6 @@ "version": "4.0.0", "resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-4.0.0.tgz", "integrity": "sha512-dxtLJO6sc35jWidmLxo7ij+Eg48PM/kleBsxpC8QJE0qJICe+KawkDQmvCMZUr9u7WKVHgMW6vy3fQ7zMiFZMA==", - "dev": true, "funding": [ { "type": "github", @@ -12207,7 +12214,6 @@ "version": "5.0.3", "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-5.0.3.tgz", "integrity": "sha512-1tB5mhVo7U+ETBKNf92xT4hrQa3pm0MZ0PQvuDnWgAAGHDsfp4lPSpiS6psrSiet87wyGPh9ft6wmhOMQ0hDiw==", - "dev": true, "license": "MIT", "engines": { "node": ">=14.16" @@ -12659,7 +12665,6 @@ "version": "3.0.3", "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz", "integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==", - "dev": true, "license": "MIT", "dependencies": { "end-of-stream": "^1.1.0", diff --git a/package.json b/package.json index 989a29e..49385ba 100644 --- a/package.json +++ b/package.json @@ -59,7 +59,6 @@ "hot-hook": "^0.4.0", "numeral": "^2.0.6", "pinia": "^3.0.2", - "pino-pretty": "^13.0.0", "postcss-loader": "^8.1.1", "prettier": "^3.4.2", "supertest": "^6.3.3", @@ -115,7 +114,9 @@ "node-2fa": "^2.0.3", "node-exceptions": "^4.0.1", "notiwind": "^2.0.0", + "p-limit": "^7.1.1", "pg": "^8.9.0", + "pino-pretty": "^13.0.0", "qrcode": "^1.5.3", "redis": "^5.0.0", "reflect-metadata": "^0.2.1", diff --git a/providers/vinejs_provider.ts b/providers/vinejs_provider.ts index e4aad4c..568dd3e 100644 --- a/providers/vinejs_provider.ts +++ b/providers/vinejs_provider.ts @@ -6,17 +6,16 @@ import type { ApplicationService } from '@adonisjs/core/types'; import vine, { symbols, BaseLiteralType, Vine } from '@vinejs/vine'; import type { FieldContext, FieldOptions } from '@vinejs/vine/types'; -// import type { MultipartFile, FileValidationOptions } from '@adonisjs/bodyparser/types'; import type { MultipartFile } from '@adonisjs/core/bodyparser'; import type { FileValidationOptions } from '@adonisjs/core/types/bodyparser'; import { Request, RequestValidator } from '@adonisjs/core/http'; import MimeType from '#models/mime_type'; - /** * Validation options accepted by the "file" rule */ export type FileRuleValidationOptions = Partial | ((field: FieldContext) => Partial); + /** * Extend VineJS */ @@ -25,6 +24,7 @@ declare module '@vinejs/vine' { myfile(options?: FileRuleValidationOptions): VineMultipartFile; } } + /** * Extend HTTP request class */ @@ -36,19 +36,54 @@ declare module '@adonisjs/core/http' { * Checks if the value is an instance of multipart file * from bodyparser. 
*/ -export function isBodyParserFile(file: MultipartFile | unknown): boolean { +export function isBodyParserFile(file: MultipartFile | unknown): file is MultipartFile { return !!(file && typeof file === 'object' && 'isMultipartFile' in file); } -export async function getEnabledExtensions() { - const enabledExtensions = await MimeType.query().select('file_extension').where('enabled', true).exec(); - const extensions = enabledExtensions - .map((extension) => { - return extension.file_extension.split('|'); - }) - .flat(); - return extensions; +/** + * Cache for enabled extensions to reduce database queries + */ +let extensionsCache: string[] | null = null; +let cacheTimestamp = 0; +const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes + +/** + * Get enabled extensions with caching + */ +export async function getEnabledExtensions(): Promise<string[]> { + const now = Date.now(); + + if (extensionsCache && now - cacheTimestamp < CACHE_DURATION) { + return extensionsCache; + } + + try { + const enabledExtensions = await MimeType.query().select('file_extension').where('enabled', true).exec(); + + const extensions = enabledExtensions + .map((extension) => extension.file_extension.split('|')) + .flat() + .map((ext) => ext.toLowerCase().trim()) + .filter((ext) => ext.length > 0); + + extensionsCache = [...new Set(extensions)]; // Remove duplicates + cacheTimestamp = now; + + return extensionsCache; + } catch (error) { + console.error('Error fetching enabled extensions:', error); + return extensionsCache || []; + } } + +/** + * Clear extensions cache + */ +export function clearExtensionsCache(): void { + extensionsCache = null; + cacheTimestamp = 0; +} + /** * VineJS validation rule that validates the file to be an * instance of BodyParser MultipartFile class. @@ -65,6 +100,7 @@ const isMultipartFile = vine.createRule(async (file: MultipartFile | unknown, op // At this point, you can use type assertion to explicitly tell TypeScript that file is of type MultipartFile const validatedFile = file as MultipartFile; const validationOptions = typeof options === 'function' ? 
options(field) : options; + /** * Set size when it's defined in the options and missing * on the file instance */ if (validatedFile.sizeLimit === undefined && validationOptions.size) { validatedFile.sizeLimit = validationOptions.size; } + /** * Set extensions when it's defined in the options and missing * on the file instance */ - // if (validatedFile.allowedExtensions === undefined && validationOptions.extnames) { - // validatedFile.allowedExtensions = validationOptions.extnames; - // } - if (validatedFile.allowedExtensions === undefined && validationOptions.extnames !== undefined) { - validatedFile.allowedExtensions = validationOptions.extnames; // await getEnabledExtensions(); - } else if (validatedFile.allowedExtensions === undefined && validationOptions.extnames === undefined) { - validatedFile.allowedExtensions = await getEnabledExtensions(); + if (validatedFile.allowedExtensions === undefined) { + if (validationOptions.extnames !== undefined) { + validatedFile.allowedExtensions = validationOptions.extnames; + } else { + validatedFile.allowedExtensions = await getEnabledExtensions(); + } } - /** - * wieder lΓΆschen - * Set extensions when it's defined in the options and missing - * on the file instance - */ - // if (file.clientNameSizeLimit === undefined && validationOptions.clientNameSizeLimit) { - // file.clientNameSizeLimit = validationOptions.clientNameSizeLimit; - // } + /** * Validate file */ - validatedFile.validate(); + try { + validatedFile.validate(); + } catch (error) { + field.report(`File validation failed: ${error.message}`, 'file.validation_error', field, validationOptions); + return; + } + /** * Report errors */ @@ -107,36 +142,37 @@ const isMultipartFile = vine.createRule(async (file: MultipartFile | unknown, op const MULTIPART_FILE: typeof symbols.SUBTYPE = symbols.SUBTYPE; export class VineMultipartFile extends BaseLiteralType<MultipartFile, MultipartFile, MultipartFile> { - [MULTIPART_FILE]: string; - // constructor(validationOptions?: FileRuleValidationOptions, options?: FieldOptions) { - // super(options, [isMultipartFile(validationOptions || {})]); - // this.validationOptions = validationOptions; - // this.#private = true; - // } - - // clone(): this { - // return new VineMultipartFile(this.validationOptions, this.cloneOptions()) as this; - // } - // #private; - // constructor(validationOptions?: FileRuleValidationOptions, options?: FieldOptions, validations?: Validation[]); - // clone(): this; - - public validationOptions; + public validationOptions?: FileRuleValidationOptions; // extnames: (18) ['gpkg', 'htm', 'html', 'csv', 'txt', 'asc', 'c', 'cc', 'h', 'srt', 'tiff', 'pdf', 'png', 'zip', 'jpg', 'jpeg', 'jpe', 'xlsx'] // size: '512mb' - // public constructor(validationOptions?: FileRuleValidationOptions, options?: FieldOptions, validations?: Validation[]) { public constructor(validationOptions?: FileRuleValidationOptions, options?: FieldOptions) { - // super(options, validations); super(options, [isMultipartFile(validationOptions || {})]); this.validationOptions = validationOptions; } public clone(): any { - // return new VineMultipartFile(this.validationOptions, this.cloneOptions(), this.cloneValidations()); return new VineMultipartFile(this.validationOptions, this.cloneOptions()); } + + /** + * Set maximum file size + */ + public maxSize(size: string | number): this { + const newOptions = { ...this.validationOptions, size }; + return new VineMultipartFile(newOptions, this.cloneOptions()) as this; + } + 
+ /** + * Set allowed extensions + */ + public extensions(extnames: string[]): this { + const newOptions = { ...this.validationOptions, extnames }; + return new VineMultipartFile(newOptions, this.cloneOptions()) as this; + } + + } export default class VinejsProvider { @@ -155,13 +191,8 @@ export default class VinejsProvider { /** * The container bindings have booted */ boot(): void { - // VineString.macro('translatedLanguage', function (this: VineString, options: Options) { - // return this.use(translatedLanguageRule(options)); - // }); - - Vine.macro('myfile', function (this: Vine, options) { + Vine.macro('myfile', function (this: Vine, options?: FileRuleValidationOptions) { return new VineMultipartFile(options); }); @@ -175,6 +206,41 @@ export default class VinejsProvider { } return new RequestValidator(this.ctx).validateUsing(...args); }); + + // Ensure MIME validation macros are loaded + this.loadMimeValidationMacros(); + this.loadFileScanMacros(); + this.loadFileLengthMacros(); + } + + /** + * Load MIME validation macros - called during boot to ensure they're available + */ + private async loadMimeValidationMacros(): Promise<void> { + try { + // Dynamically import the MIME validation rule to ensure macros are registered + await import('#start/rules/allowed_extensions_mimetypes'); + } catch (error) { + console.warn('Could not load MIME validation macros:', error); + } + } + + private async loadFileScanMacros(): Promise<void> { + try { + // Dynamically import the file scan rule to ensure macros are registered + await import('#start/rules/file_scan'); + } catch (error) { + console.warn('Could not load file scan macros:', error); + } + } + + private async loadFileLengthMacros(): Promise<void> { + try { + // Dynamically import the file length rule to ensure macros are registered + await import('#start/rules/file_length'); + } catch (error) { + console.warn('Could not load file length macros:', error); + } } /** @@ -190,5 +256,7 @@ /** * Preparing to shutdown the app */ - async shutdown() {} + async shutdown() { + clearExtensionsCache(); + } }
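Note on commands/fix_dataset_cross_references.ts: the body of extractDatasetPublishIdFromReference() lies outside the visible hunks. Based on the two LIKE patterns in the query ('%doi.org/10.24341/tethys.%' and '%tethys.at/dataset/%'), a plausible reconstruction looks like the sketch below; it is an assumption for illustration, not the committed code.

// Hypothetical sketch: pull the numeric publish_id out of a Tethys DOI or
// landing-page URL; returns null when neither pattern matches.
function extractPublishId(value: string): number | null {
  const match = value.match(/doi\.org\/10\.24341\/tethys\.(\d+)/) ?? value.match(/tethys\.at\/dataset\/(\d+)/);
  return match ? Number(match[1]) : null;
}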
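Note on commands/list_updatable_datacite.ts: processChunk() combines p-limit with Promise.allSettled so that at most five DataCite lookups run at once and a single failing check cannot abort the whole chunk. A minimal standalone sketch of the same pattern, assuming only the p-limit package pinned in package.json (the helper name filterNeedingUpdate is illustrative):

import pLimit from 'p-limit';

// Bounded-concurrency filter: run needsUpdate() for every item, at most five
// at a time; a check that throws falls back to including the item (fail-safe).
async function filterNeedingUpdate<T>(items: T[], needsUpdate: (item: T) => Promise<boolean>): Promise<T[]> {
  const limit = pLimit(5);
  const settled = await Promise.allSettled(
    items.map((item) => limit(async () => ((await needsUpdate(item).catch(() => true)) ? item : null))),
  );
  return settled
    .filter((r): r is PromiseFulfilledResult<T> => r.status === 'fulfilled' && r.value !== null)
    .map((r) => r.value);
}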
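Note on the 300 ms DOI check in shouldUpdateDataset(): the DataCite call races a rejecting timer, and any failure collapses to null, after which the command errs on the side of including the dataset. Keep in mind that Promise.race() does not cancel the losing call; it only stops waiting for it, and 300 ms is a tight budget for a remote API. A sketch of the guard:

// Race an API call against a rejecting timer; timeout or lookup failure
// both collapse to null so the caller can apply its fail-safe default.
function createTimeout(ms: number): Promise<never> {
  return new Promise((_, reject) => setTimeout(() => reject(new Error(`API call timeout after ${ms}ms`)), ms));
}

async function withTimeout<T>(lookup: Promise<T>, ms = 300): Promise<T | null> {
  return Promise.race([lookup, createTimeout(ms)]).catch(() => null);
}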
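Note on the tolerance comparison used by both commands (600 s in list:updateable-datacite, 60 s in update:datacite): a dataset only counts as newer than its DOI record when the gap exceeds the threshold, which avoids re-pushing records whose timestamps differ by the few seconds DataCite needs to store an update. The shared logic, sketched with luxon:

import { DateTime } from 'luxon';

// Returns true only when the dataset is newer than the DOI record by more
// than the tolerance; small clock skews and storage delays are ignored.
function needsDataCiteUpdate(datasetModified: DateTime, doiModified: DateTime, toleranceSeconds: number): boolean {
  if (datasetModified > doiModified) {
    const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds);
    return diffInSeconds > toleranceSeconds;
  }
  return false;
}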
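Note on the extensions cache in providers/vinejs_provider.ts: it is a module-level memo with a five-minute TTL that serves the stale value when the MimeType query fails. The same shape can be factored into a generic helper; a sketch, with the name ttlCached being illustrative:

// Generic single-value TTL cache: re-runs load() only when the cached value
// is older than ttlMs; on failure, serves the stale value if one exists.
function ttlCached<T>(load: () => Promise<T>, ttlMs: number): () => Promise<T> {
  let value: T | null = null;
  let stamp = 0;
  return async () => {
    if (value !== null && Date.now() - stamp < ttlMs) return value;
    try {
      value = await load();
      stamp = Date.now();
      return value;
    } catch (error) {
      if (value !== null) return value; // serve stale on failure
      throw error;
    }
  };
}

Usage would be, for example, const getExtensions = ttlCached(fetchEnabledExtensions, 5 * 60 * 1000);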
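Note on the new maxSize()/extensions() helpers on VineMultipartFile: a hypothetical usage sketch (validator shape and field name are illustrative; '512mb' mirrors the size comment in the class, and omitting extensions() falls back to the cached enabled-extensions list):

import vine from '@vinejs/vine';

// Hypothetical validator built on the myfile macro registered in VinejsProvider.
const uploadValidator = vine.compile(
  vine.object({
    file: vine.myfile().maxSize('512mb').extensions(['pdf', 'zip', 'csv']),
  }),
);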