Compare commits

...

4 commits

Author SHA1 Message Date
6757bdb77c feat: Enhance ClamAV Docker entrypoint and configuration
- Updated docker-entrypoint.sh to improve ClamAV service initialization and logging.
- Added checks for ClamAV and freshclam daemon status.
- Optimized freshclam configuration for container usage, including logging to stdout and setting database directory.
- Introduced caching mechanism for enabled file extensions in vinejs_provider.ts to reduce database queries.
- Implemented a new command to list datasets needing DataCite DOI updates, with options for verbose output, count only, and IDs only.
- Updated package dependencies to include p-limit and pino-pretty.
- Finalized ace command 'detect:missing-cross-references'
2025-09-26 12:19:35 +02:00
4c8cce27da feat: Update form field labels from "Main Title Language*" to "Main Description Language*" for clarity 2025-09-19 17:21:21 +02:00
2f079e6fdd feat: Add optional ORCID identifier to dataset validation also for the submitter
npm updates
2025-09-19 16:26:01 +02:00
c049b22723 - feat: Enhance README with setup instructions, usage, and command documentation
- fix: Update API routes to include DOI URL handling and improve route organization

- chore: Add ORCID preload rule file and ensure proper registration

- docs: Add MIT License to the project for open-source compliance

- feat: Implement command to detect and fix missing dataset cross-references

- feat: Create command for updating DataCite DOI records with detailed logging and error handling

- docs: Add comprehensive documentation for dataset indexing command

- docs: Create detailed documentation for DataCite update command with usage examples and error handling
2025-09-19 14:35:23 +02:00
22 changed files with 2865 additions and 914 deletions

View file

@@ -1,55 +1,61 @@
################## First Stage - Creating base #########################
# Created a variable to hold our node base image
ARG NODE_IMAGE=node:22-bookworm-slim
ARG NODE_IMAGE=node:22-trixie-slim
FROM $NODE_IMAGE AS base
# Install dumb-init and ClamAV, and perform ClamAV database update
RUN apt update \
&& apt-get install -y dumb-init clamav clamav-daemon nano \
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
dumb-init \
clamav \
clamav-daemon \
ca-certificates \
&& rm -rf /var/lib/apt/lists/* \
# Creating folders and changing ownerships
&& mkdir -p /home/node/app && chown node:node /home/node/app \
&& mkdir -p /home/node/app \
&& mkdir -p /var/lib/clamav \
&& mkdir /usr/local/share/clamav \
&& chown -R node:clamav /var/lib/clamav /usr/local/share/clamav /etc/clamav \
# permissions
&& mkdir /var/run/clamav \
&& chown node:clamav /var/run/clamav \
&& chmod 750 /var/run/clamav
# -----------------------------------------------
# --- ClamAV & FreshClam ------------------------
# -----------------------------------------------
# RUN \
# chmod 644 /etc/clamav/freshclam.conf && \
# freshclam && \
# mkdir /var/run/clamav && \
# chown -R clamav:root /var/run/clamav
&& mkdir -p /var/log/clamav \
&& mkdir -p /tmp/clamav-logs \
# Set ownership and permissions
&& chown node:node /home/node/app \
# && chown -R node:clamav /var/lib/clamav /usr/local/share/clamav /etc/clamav /var/run/clamav \
&& chown -R clamav:clamav /var/lib/clamav /usr/local/share/clamav /etc/clamav /var/run/clamav /var/log/clamav \
&& chmod 755 /tmp/clamav-logs \
&& chmod 750 /var/run/clamav \
&& chmod 755 /var/lib/clamav \
&& chmod 755 /var/log/clamav \
# Add node user to clamav group and allow sudo for clamav commands
&& usermod -a -G clamav node \
&& chmod g+w /var/run/clamav /var/lib/clamav /var/log/clamav /tmp/clamav-logs
# # initial update of av databases
# RUN freshclam
# Configure Clam AV...
COPY --chown=node:clamav ./*.conf /etc/clamav/
# Configure ClamAV - copy config files before switching user
# COPY --chown=node:clamav ./*.conf /etc/clamav/
COPY --chown=clamav:clamav ./*.conf /etc/clamav/
# Copy entrypoint script
COPY --chown=node:node docker-entrypoint.sh /home/node/app/docker-entrypoint.sh
RUN chmod +x /home/node/app/docker-entrypoint.sh
ENV TZ="Europe/Vienna"
# # permissions
# RUN mkdir /var/run/clamav && \
# chown node:clamav /var/run/clamav && \
# chmod 750 /var/run/clamav
# Setting the working directory
WORKDIR /home/node/app
# Changing the current active user to "node"
# Download initial ClamAV database as root before switching users
USER root
RUN freshclam --quiet || echo "Initial database download failed - will retry at runtime"
USER node
# initial update of av databases
RUN freshclam
# VOLUME /var/lib/clamav
COPY --chown=node:clamav docker-entrypoint.sh /home/node/app/docker-entrypoint.sh
RUN chmod +x /home/node/app/docker-entrypoint.sh
ENV TZ="Europe/Vienna"
# Initial update of AV databases (moved after USER directive)
# RUN freshclam || true
################## Second Stage - Installing dependencies ##########
@@ -70,14 +76,13 @@ ENV NODE_ENV=production
# We run "node ace build" to build the app (dist folder) for production
RUN node ace build --ignore-ts-errors
# RUN node ace build --production
# RUN node ace build --ignore-ts-errors
################## Final Stage - Production #########################
# In this final stage, we will start running the application
FROM base AS production
# Here, we include all the required environment variables
# ENV NODE_ENV=production
ENV NODE_ENV=production
# ENV PORT=$PORT
# ENV HOST=0.0.0.0
@@ -91,4 +96,4 @@ COPY --chown=node:node --from=build /home/node/app/build .
EXPOSE 3333
ENTRYPOINT ["/home/node/app/docker-entrypoint.sh"]
# Run the command to start the server using "dumb-init"
CMD [ "dumb-init", "node", "bin/server.js" ]
CMD [ "node", "bin/server.js" ]

22
LICENSE Normal file
View file

@@ -0,0 +1,22 @@
MIT License
Copyright (c) 2025 Tethys Research Repository
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@@ -30,9 +30,9 @@ export default defineConfig({
() => import('#start/rules/unique'),
() => import('#start/rules/translated_language'),
() => import('#start/rules/unique_person'),
() => import('#start/rules/file_length'),
() => import('#start/rules/file_scan'),
() => import('#start/rules/allowed_extensions_mimetypes'),
// () => import('#start/rules/file_length'),
// () => import('#start/rules/file_scan'),
// () => import('#start/rules/allowed_extensions_mimetypes'),
() => import('#start/rules/dependent_array_min_length'),
() => import('#start/rules/referenceValidation'),
() => import('#start/rules/valid_mimetype'),

View file

@@ -1,23 +1,35 @@
import type { HttpContext } from '@adonisjs/core/http';
// import Person from 'App/Models/Person';
import Dataset from '#models/dataset';
import { StatusCodes } from 'http-status-codes';
// node ace make:controller Author
export default class DatasetController {
public async index({}: HttpContext) {
// Select datasets with server_state 'published' or 'deleted' and sort by the last published date
const datasets = await Dataset.query()
.where(function (query) {
query.where('server_state', 'published').orWhere('server_state', 'deleted');
})
.preload('titles')
.preload('identifier')
.orderBy('server_date_published', 'desc');
/**
* GET /api/datasets
* Find all published datasets
*/
public async index({ response }: HttpContext) {
try {
const datasets = await Dataset.query()
.where(function (query) {
query.where('server_state', 'published').orWhere('server_state', 'deleted');
})
.preload('titles')
.preload('identifier')
.orderBy('server_date_published', 'desc');
return datasets;
return response.status(StatusCodes.OK).json(datasets);
} catch (error) {
return response.status(StatusCodes.INTERNAL_SERVER_ERROR).json({
message: error.message || 'Some error occurred while retrieving datasets.',
});
}
}
/**
* GET /api/dataset
* Find all published datasets
*/
public async findAll({ response }: HttpContext) {
try {
const datasets = await Dataset.query()
@@ -33,48 +45,142 @@ export default class DatasetController {
}
}
public async findOne({ params }: HttpContext) {
const datasets = await Dataset.query()
.where('publish_id', params.publish_id)
.preload('titles')
.preload('descriptions')
.preload('user', (builder) => {
builder.select(['id', 'firstName', 'lastName', 'avatar', 'login']);
})
.preload('authors', (builder) => {
builder
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
.withCount('datasets', (query) => {
query.as('datasets_count');
})
.pivotColumns(['role', 'sort_order'])
.orderBy('pivot_sort_order', 'asc');
})
.preload('contributors', (builder) => {
builder
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
.withCount('datasets', (query) => {
query.as('datasets_count');
})
.pivotColumns(['role', 'sort_order', 'contributor_type'])
.orderBy('pivot_sort_order', 'asc');
})
.preload('subjects')
.preload('coverage')
.preload('licenses')
.preload('references')
.preload('project')
.preload('referenced_by', (builder) => {
builder.preload('dataset', (builder) => {
builder.preload('identifier');
});
})
.preload('files', (builder) => {
builder.preload('hashvalues');
})
.preload('identifier')
.firstOrFail();
/**
* GET /api/dataset/:publish_id
* Find one dataset by publish_id
*/
public async findOne({ response, params }: HttpContext) {
try {
const dataset = await Dataset.query()
.where('publish_id', params.publish_id)
.preload('titles')
.preload('descriptions') // Using 'descriptions' instead of 'abstracts'
.preload('user', (builder) => {
builder.select(['id', 'firstName', 'lastName', 'avatar', 'login']);
})
.preload('authors', (builder) => {
builder
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
.withCount('datasets', (query) => {
query.as('datasets_count');
})
.pivotColumns(['role', 'sort_order'])
.orderBy('pivot_sort_order', 'asc');
})
.preload('contributors', (builder) => {
builder
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
.withCount('datasets', (query) => {
query.as('datasets_count');
})
.pivotColumns(['role', 'sort_order', 'contributor_type'])
.orderBy('pivot_sort_order', 'asc');
})
.preload('subjects')
.preload('coverage')
.preload('licenses')
.preload('references')
.preload('project')
.preload('referenced_by', (builder) => {
builder.preload('dataset', (builder) => {
builder.preload('identifier');
});
})
.preload('files', (builder) => {
builder.preload('hashvalues');
})
.preload('identifier')
.first(); // Use first() instead of firstOrFail() to handle not found gracefully
return datasets;
if (!dataset) {
return response.status(StatusCodes.NOT_FOUND).json({
message: `Cannot find Dataset with publish_id=${params.publish_id}.`,
});
}
return response.status(StatusCodes.OK).json(dataset);
} catch (error) {
return response.status(StatusCodes.INTERNAL_SERVER_ERROR).json({
message: error.message || `Error retrieving Dataset with publish_id=${params.publish_id}.`,
});
}
}
/**
* GET /:prefix/:value
* Find dataset by identifier (e.g., https://doi.tethys.at/10.24341/tethys.99.2)
*/
public async findByIdentifier({ response, params }: HttpContext) {
const identifierValue = `${params.prefix}/${params.value}`;
// Optional: Validate DOI format
if (!identifierValue.match(/^10\.\d+\/[a-zA-Z0-9._-]+\.[0-9]+(?:\.[0-9]+)*$/)) {
return response.status(StatusCodes.BAD_REQUEST).json({
message: `Invalid DOI format: ${identifierValue}`,
});
}
try {
// Method 1: Using subquery with whereIn (most similar to your original)
const dataset = await Dataset.query()
// .whereIn('id', (subQuery) => {
// subQuery.select('dataset_id').from('dataset_identifiers').where('value', identifierValue);
// })
.whereHas('identifier', (builder) => {
builder.where('value', identifierValue);
})
.preload('titles')
.preload('descriptions') // Using 'descriptions' instead of 'abstracts'
.preload('user', (builder) => {
builder.select(['id', 'firstName', 'lastName', 'avatar', 'login']);
})
.preload('authors', (builder) => {
builder
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
.withCount('datasets', (query) => {
query.as('datasets_count');
})
.pivotColumns(['role', 'sort_order'])
.wherePivot('role', 'author')
.orderBy('pivot_sort_order', 'asc');
})
.preload('contributors', (builder) => {
builder
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
.withCount('datasets', (query) => {
query.as('datasets_count');
})
.pivotColumns(['role', 'sort_order', 'contributor_type'])
.wherePivot('role', 'contributor')
.orderBy('pivot_sort_order', 'asc');
})
.preload('subjects')
.preload('coverage')
.preload('licenses')
.preload('references')
.preload('project')
.preload('referenced_by', (builder) => {
builder.preload('dataset', (builder) => {
builder.preload('identifier');
});
})
.preload('files', (builder) => {
builder.preload('hashvalues');
})
.preload('identifier')
.first();
if (!dataset) {
return response.status(StatusCodes.NOT_FOUND).json({
message: `Cannot find Dataset with identifier=${identifierValue}.`,
});
}
return response.status(StatusCodes.OK).json(dataset);
} catch (error) {
return response.status(StatusCodes.INTERNAL_SERVER_ERROR).json({
message: error.message || `Error retrieving Dataset with identifier=${identifierValue}.`,
});
}
}
}

View file

@@ -235,6 +235,7 @@ export default class DatasetController {
.isUniquePerson({ table: 'persons', column: 'email', idField: 'id' }),
first_name: vine.string().trim().minLength(3).maxLength(255).optional().requiredWhen('name_type', '=', 'Personal'),
last_name: vine.string().trim().minLength(3).maxLength(255),
identifier_orcid: vine.string().trim().maxLength(255).orcid().optional(),
}),
)
.minLength(1)
@@ -251,6 +252,7 @@ export default class DatasetController {
.isUniquePerson({ table: 'persons', column: 'email', idField: 'id' }),
first_name: vine.string().trim().minLength(3).maxLength(255).optional().requiredWhen('name_type', '=', 'Personal'),
last_name: vine.string().trim().minLength(3).maxLength(255),
identifier_orcid: vine.string().trim().maxLength(255).orcid().optional(),
pivot_contributor_type: vine.enum(Object.keys(ContributorTypes)),
}),
)
@@ -326,6 +328,7 @@ export default class DatasetController {
.isUniquePerson({ table: 'persons', column: 'email', idField: 'id' }),
first_name: vine.string().trim().minLength(3).maxLength(255).optional().requiredWhen('name_type', '=', 'Personal'),
last_name: vine.string().trim().minLength(3).maxLength(255),
identifier_orcid: vine.string().trim().maxLength(255).orcid().optional(),
}),
)
.minLength(1)
@@ -342,6 +345,7 @@ export default class DatasetController {
.isUniquePerson({ table: 'persons', column: 'email', idField: 'id' }),
first_name: vine.string().trim().minLength(3).maxLength(255).optional().requiredWhen('name_type', '=', 'Personal'),
last_name: vine.string().trim().minLength(3).maxLength(255),
identifier_orcid: vine.string().trim().maxLength(255).orcid().optional(),
pivot_contributor_type: vine.enum(Object.keys(ContributorTypes)),
}),
)

View file

@@ -1,6 +1,3 @@
// import { Client } from 'guzzle';
// import { Log } from '@adonisjs/core/build/standalone';
// import { DoiInterface } from './interfaces/DoiInterface';
import DoiClientContract from '#app/Library/Doi/DoiClientContract';
import DoiClientException from '#app/exceptions/DoiClientException';
import { StatusCodes } from 'http-status-codes';
@@ -12,14 +9,14 @@ export class DoiClient implements DoiClientContract {
public username: string;
public password: string;
public serviceUrl: string;
public apiUrl: string;
constructor() {
// const datacite_environment = process.env.DATACITE_ENVIRONMENT || 'debug';
this.username = process.env.DATACITE_USERNAME || '';
this.password = process.env.DATACITE_PASSWORD || '';
this.serviceUrl = process.env.DATACITE_SERVICE_URL || '';
// this.prefix = process.env.DATACITE_PREFIX || '';
// this.base_domain = process.env.BASE_DOMAIN || '';
this.apiUrl = process.env.DATACITE_API_URL || 'https://api.datacite.org';
if (this.username === '' || this.password === '' || this.serviceUrl === '') {
const message = 'Missing configuration settings to properly initialize DOI client';
@@ -90,4 +87,240 @@
throw new DoiClientException(error.response.status, error.response.data);
}
}
/**
* Retrieves DOI information from DataCite REST API
*
* @param doiValue The DOI identifier e.g. '10.5072/tethys.999'
* @returns Promise with DOI information or null if not found
*/
public async getDoiInfo(doiValue: string): Promise<any | null> {
try {
// Use configurable DataCite REST API URL
const dataciteApiUrl = `${this.apiUrl}/dois/${doiValue}`;
const response = await axios.get(dataciteApiUrl, {
headers: {
Accept: 'application/vnd.api+json',
},
});
if (response.status === 200 && response.data.data) {
return {
created: response.data.data.attributes.created,
registered: response.data.data.attributes.registered,
updated: response.data.data.attributes.updated,
published: response.data.data.attributes.published,
state: response.data.data.attributes.state,
url: response.data.data.attributes.url,
metadata: response.data.data.attributes,
};
}
} catch (error) {
if (error.response?.status === 404) {
logger.debug(`DOI ${doiValue} not found in DataCite`);
return null;
}
logger.debug(`DataCite REST API failed for ${doiValue}: ${error.message}`);
// Fallback to MDS API
return await this.getDoiInfoFromMds(doiValue);
}
return null;
}
/**
* Fallback method to get DOI info from MDS API
*
* @param doiValue The DOI identifier
* @returns Promise with basic DOI information or null
*/
private async getDoiInfoFromMds(doiValue: string): Promise<any | null> {
try {
const auth = {
username: this.username,
password: this.password,
};
// Get DOI URL
const doiResponse = await axios.get(`${this.serviceUrl}/doi/${doiValue}`, { auth });
if (doiResponse.status === 200) {
// Get metadata if available
try {
const metadataResponse = await axios.get(`${this.serviceUrl}/metadata/${doiValue}`, {
auth,
headers: {
Accept: 'application/xml',
},
});
return {
url: doiResponse.data.trim(),
metadata: metadataResponse.data,
created: new Date().toISOString(), // MDS doesn't provide creation dates
registered: new Date().toISOString(), // Use current time as fallback
source: 'mds',
};
} catch (metadataError) {
// Return basic info even if metadata fetch fails
return {
url: doiResponse.data.trim(),
created: new Date().toISOString(),
registered: new Date().toISOString(),
source: 'mds',
};
}
}
} catch (error) {
if (error.response?.status === 404) {
logger.debug(`DOI ${doiValue} not found in DataCite MDS`);
return null;
}
logger.debug(`DataCite MDS API failed for ${doiValue}: ${error.message}`);
}
return null;
}
/**
* Checks if a DOI exists in DataCite
*
* @param doiValue The DOI identifier
* @returns Promise<boolean> True if DOI exists
*/
public async doiExists(doiValue: string): Promise<boolean> {
const doiInfo = await this.getDoiInfo(doiValue);
return doiInfo !== null;
}
/**
* Gets the last modification date of a DOI
*
* @param doiValue The DOI identifier
* @returns Promise<Date | null> Last modification date or creation date if never updated, null if not found
*/
public async getDoiLastModified(doiValue: string): Promise<Date | null> {
const doiInfo = await this.getDoiInfo(doiValue);
if (doiInfo) {
// Use updated date if available, otherwise fall back to created/registered date
const dateToUse = doiInfo.updated || doiInfo.registered || doiInfo.created;
if (dateToUse) {
logger.debug(
`DOI ${doiValue}: Using ${doiInfo.updated ? 'updated' : doiInfo.registered ? 'registered' : 'created'} date: ${dateToUse}`,
);
return new Date(dateToUse);
}
}
return null;
}
/**
* Makes a DOI unfindable (registered but not discoverable)
* Note: DOIs cannot be deleted, only made unfindable
* await doiClient.makeDoiUnfindable('10.21388/tethys.231');
*
* @param doiValue The DOI identifier e.g. '10.5072/tethys.999'
* @returns Promise<AxiosResponse<any>> The http response
*/
public async makeDoiUnfindable(doiValue: string): Promise<AxiosResponse<any>> {
const auth = {
username: this.username,
password: this.password,
};
try {
// First, check if DOI exists
const exists = await this.doiExists(doiValue);
if (!exists) {
throw new DoiClientException(404, `DOI ${doiValue} not found`);
}
// Delete the DOI URL mapping to make it unfindable
// This removes the URL but keeps the metadata registered
const response = await axios.delete(`${this.serviceUrl}/doi/${doiValue}`, { auth });
// Response Codes for DELETE /doi/{doi}
// 200 OK: operation successful
// 401 Unauthorized: no login
// 403 Forbidden: login problem, quota exceeded
// 404 Not Found: DOI does not exist
if (response.status !== 200) {
const message = `Unexpected DataCite MDS response code ${response.status}`;
logger.error(message);
throw new DoiClientException(response.status, message);
}
logger.info(`DOI ${doiValue} successfully made unfindable`);
return response;
} catch (error) {
logger.error(`Failed to make DOI ${doiValue} unfindable: ${error.message}`);
if (error instanceof DoiClientException) {
throw error;
}
throw new DoiClientException(error.response?.status || 500, error.response?.data || error.message);
}
}
/**
* Makes a DOI findable again by re-registering the URL
* await doiClient.makeDoiFindable(
* '10.21388/tethys.231',
* 'https://doi.dev.tethys.at/10.21388/tethys.231'
* );
*
* @param doiValue The DOI identifier e.g. '10.5072/tethys.999'
* @param landingPageUrl The landing page URL
* @returns Promise<AxiosResponse<any>> The http response
*/
public async makeDoiFindable(doiValue: string, landingPageUrl: string): Promise<AxiosResponse<any>> {
const auth = {
username: this.username,
password: this.password,
};
try {
// Re-register the DOI with its URL to make it findable again
const response = await axios.put(`${this.serviceUrl}/doi/${doiValue}`, `doi=${doiValue}\nurl=${landingPageUrl}`, { auth });
// Response Codes for PUT /doi/{doi}
// 201 Created: operation successful
// 400 Bad Request: request body must be exactly two lines: DOI and URL
// 401 Unauthorized: no login
// 403 Forbidden: login problem, quota exceeded
// 412 Precondition failed: metadata must be uploaded first
if (response.status !== 201) {
const message = `Unexpected DataCite MDS response code ${response.status}`;
logger.error(message);
throw new DoiClientException(response.status, message);
}
logger.info(`DOI ${doiValue} successfully made findable again`);
return response;
} catch (error) {
logger.error(`Failed to make DOI ${doiValue} findable: ${error.message}`);
if (error instanceof DoiClientException) {
throw error;
}
throw new DoiClientException(error.response?.status || 500, error.response?.data || error.message);
}
}
/**
* Gets the current state of a DOI (draft, registered, findable)
* const state = await doiClient.getDoiState('10.21388/tethys.231');
* console.log(`Current state: ${state}`); // 'findable'
*
* @param doiValue The DOI identifier
* @returns Promise<string | null> The DOI state or null if not found
*/
public async getDoiState(doiValue: string): Promise<string | null> {
const doiInfo = await this.getDoiInfo(doiValue);
return doiInfo?.state || null;
}
}

View file

@@ -0,0 +1,380 @@
/*
|--------------------------------------------------------------------------
| node ace make:command fix-dataset-cross-references
| DONE: create commands/fix_dataset_cross_references.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import type { CommandOptions } from '@adonisjs/core/types/ace';
import { DateTime } from 'luxon';
import Dataset from '#models/dataset';
import DatasetReference from '#models/dataset_reference';
// import env from '#start/env';
interface MissingCrossReference {
sourceDatasetId: number;
targetDatasetId: number;
sourcePublishId: number | null;
targetPublishId: number | null;
sourceDoi: string | null;
targetDoi: string | null;
referenceType: string;
relation: string;
doi: string | null;
reverseRelation: string;
}
export default class DetectMissingCrossReferences extends BaseCommand {
static commandName = 'detect:missing-cross-references';
static description = 'Detect missing bidirectional cross-references between versioned datasets';
public static needsApplication = true;
@flags.boolean({ alias: 'f', description: 'Fix missing cross-references automatically' })
public fix: boolean = false;
@flags.boolean({ alias: 'v', description: 'Verbose output' })
public verbose: boolean = false;
@flags.number({ alias: 'p', description: 'Filter by specific publish_id (source or target dataset)' })
public publish_id?: number;
// example: node ace detect:missing-cross-references --verbose -p 227 //if you want to filter by specific publish_id with details
// example: node ace detect:missing-cross-references --verbose
// example: node ace detect:missing-cross-references --fix -p 227 //if you want to filter by specific publish_id and fix it
// example: node ace detect:missing-cross-references
public static options: CommandOptions = {
startApp: true,
staysAlive: false,
};
// Define the allowed relations that we want to process
private readonly ALLOWED_RELATIONS = ['IsNewVersionOf', 'IsPreviousVersionOf', 'IsVariantFormOf', 'IsOriginalFormOf'];
async run() {
this.logger.info('🔍 Detecting missing cross-references...');
this.logger.info(`📋 Processing only these relations: ${this.ALLOWED_RELATIONS.join(', ')}`);
if (this.publish_id) {
this.logger.info(`Filtering by publish_id: ${this.publish_id}`);
}
try {
const missingReferences = await this.findMissingCrossReferences();
if (missingReferences.length === 0) {
const filterMsg = this.publish_id ? ` for publish_id ${this.publish_id}` : '';
this.logger.success(`All cross-references are properly linked for the specified relations${filterMsg}!`);
return;
}
const filterMsg = this.publish_id ? ` (filtered by publish_id ${this.publish_id})` : '';
this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s)${filterMsg}:`);
// Show brief list if not verbose mode
if (!this.verbose) {
for (const missing of missingReferences) {
const sourceDoi = missing.sourceDoi ? ` DOI: ${missing.sourceDoi}` : '';
const targetDoi = missing.targetDoi ? ` DOI: ${missing.targetDoi}` : '';
this.logger.info(
`Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId}${sourceDoi}) ${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId}${targetDoi}) → missing reverse: ${missing.reverseRelation}`,
);
}
} else {
// Verbose mode - show detailed info
for (const missing of missingReferences) {
this.logger.info(
`Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`,
);
this.logger.info(` - Reference type: ${missing.referenceType}`);
this.logger.info(` - Relation: ${missing.relation}`);
this.logger.info(` - DOI: ${missing.doi}`);
}
}
if (this.fix) {
await this.fixMissingReferences(missingReferences);
this.logger.success('All missing cross-references have been fixed!');
} else {
if (this.verbose) {
this.printMissingReferencesList(missingReferences);
}
this.logger.info('💡 Run with --fix flag to automatically create missing cross-references');
if (this.publish_id) {
this.logger.info(`🎯 Currently filtering by publish_id: ${this.publish_id}`);
}
}
} catch (error) {
this.logger.error('Error detecting missing cross-references:', error);
process.exit(1);
}
}
private async findMissingCrossReferences(): Promise<MissingCrossReference[]> {
const missingReferences: {
sourceDatasetId: number;
targetDatasetId: number;
sourcePublishId: number | null;
targetPublishId: number | null;
sourceDoi: string | null;
targetDoi: string | null;
referenceType: string;
relation: string;
doi: string | null;
reverseRelation: string;
}[] = [];
this.logger.info('📊 Querying dataset references...');
// Find all references that point to Tethys datasets (DOI or URL containing tethys DOI)
// Only from datasets that are published AND only for allowed relations
const tethysReferencesQuery = DatasetReference.query()
.whereIn('type', ['DOI', 'URL'])
.whereIn('relation', this.ALLOWED_RELATIONS) // Only process allowed relations
.where((query) => {
query.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
})
.preload('dataset', (datasetQuery) => {
datasetQuery.preload('identifier');
})
.whereHas('dataset', (datasetQuery) => {
datasetQuery.where('server_state', 'published');
});
if (typeof this.publish_id === 'number') {
tethysReferencesQuery.whereHas('dataset', (datasetQuery) => {
datasetQuery.where('publish_id', this.publish_id as number);
});
}
const tethysReferences = await tethysReferencesQuery.exec();
this.logger.info(`🔗 Found ${tethysReferences.length} Tethys references from published datasets (allowed relations only)`);
let processedCount = 0;
let skippedCount = 0;
for (const reference of tethysReferences) {
processedCount++;
if (this.verbose && processedCount % 10 === 0) {
this.logger.info(`📈 Processed ${processedCount}/${tethysReferences.length} references...`);
}
// Double-check that this relation is in our allowed list (safety check)
if (!this.ALLOWED_RELATIONS.includes(reference.relation)) {
skippedCount++;
if (this.verbose) {
this.logger.info(`⏭️ Skipping relation "${reference.relation}" - not in allowed list`);
}
continue;
}
// Extract dataset publish_id from DOI or URL
const targetDatasetPublish = this.extractDatasetPublishIdFromReference(reference.value);
if (!targetDatasetPublish) {
if (this.verbose) {
this.logger.warning(`⚠️ Could not extract publish ID from: ${reference.value}`);
}
continue;
}
// Check if target dataset exists and is published
const targetDataset = await Dataset.query()
.where('publish_id', targetDatasetPublish)
.where('server_state', 'published')
.preload('identifier')
.first();
if (!targetDataset) {
if (this.verbose) {
this.logger.warning(`⚠️ Target dataset with publish_id ${targetDatasetPublish} not found or not published`);
}
continue;
}
// Ensure we have a valid source dataset with proper preloading
if (!reference.dataset) {
this.logger.warning(`⚠️ Source dataset ${reference.document_id} not properly loaded, skipping...`);
continue;
}
// Check if reverse reference exists
const reverseReferenceExists = await this.checkReverseReferenceExists(
targetDataset.id,
// reference.document_id,
reference.relation,
);
if (!reverseReferenceExists) {
const reverseRelation = this.getReverseRelation(reference.relation);
if (reverseRelation) {
// Only add if we have a valid reverse relation
missingReferences.push({
sourceDatasetId: reference.document_id,
targetDatasetId: targetDataset.id,
sourcePublishId: reference.dataset.publish_id || null,
targetPublishId: targetDataset.publish_id || null,
referenceType: reference.type,
relation: reference.relation,
doi: reference.value,
reverseRelation: reverseRelation,
sourceDoi: reference.dataset.identifier ? reference.dataset.identifier.value : null,
targetDoi: targetDataset.identifier ? targetDataset.identifier.value : null,
});
}
}
}
this.logger.info(`✅ Processed ${processedCount} references (${skippedCount} skipped due to relation filtering)`);
return missingReferences;
}
private extractDatasetPublishIdFromReference(value: string): number | null {
// Extract from DOI: https://doi.org/10.24341/tethys.107 -> 107
const doiMatch = value.match(/10\.24341\/tethys\.(\d+)/);
if (doiMatch) {
return parseInt(doiMatch[1]);
}
// Extract from URL: https://tethys.at/dataset/107 -> 107
const urlMatch = value.match(/tethys\.at\/dataset\/(\d+)/);
if (urlMatch) {
return parseInt(urlMatch[1]);
}
return null;
}
private async checkReverseReferenceExists(targetDatasetId: number, originalRelation: string): Promise<boolean> {
const reverseRelation = this.getReverseRelation(originalRelation);
if (!reverseRelation) {
return true; // If no reverse relation is defined, consider it as "exists" to skip processing
}
// Only check for reverse references where the source dataset is also published
const reverseReference = await DatasetReference.query()
// We don't filter by source document_id here to find any incoming reference from any published dataset
// .where('document_id', sourceDatasetId)
.where('related_document_id', targetDatasetId)
.where('relation', reverseRelation)
.first();
return !!reverseReference;
}
private getReverseRelation(relation: string): string | null {
const relationMap: Record<string, string> = {
IsNewVersionOf: 'IsPreviousVersionOf',
IsPreviousVersionOf: 'IsNewVersionOf',
IsVariantFormOf: 'IsOriginalFormOf',
IsOriginalFormOf: 'IsVariantFormOf',
};
// Only return reverse relation if it exists in our map, otherwise return null
return relationMap[relation] || null;
}
private printMissingReferencesList(missingReferences: MissingCrossReference[]) {
console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
console.log('│ MISSING CROSS-REFERENCES REPORT │');
console.log('│ (Published Datasets Only - Filtered Relations) │');
console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
console.log();
missingReferences.forEach((missing, index) => {
console.log(
`${index + 1}. Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId} Identifier: ${missing.sourceDoi})
${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId} Identifier: ${missing.targetDoi})`,
);
console.log(` ├─ Current relation: "${missing.relation}"`);
console.log(` ├─ Missing reverse relation: "${missing.reverseRelation}"`);
console.log(` ├─ Reference type: ${missing.referenceType}`);
console.log(` └─ DOI/URL: ${missing.doi}`);
console.log();
});
console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
console.log(`│ SUMMARY: ${missingReferences.length} missing reverse reference(s) detected │`);
console.log(`│ Processed relations: ${this.ALLOWED_RELATIONS.join(', ')}`);
console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
}
private async fixMissingReferences(missingReferences: MissingCrossReference[]) {
this.logger.info('🔧 Creating missing cross-references in database...');
let fixedCount = 0;
let errorCount = 0;
for (const [index, missing] of missingReferences.entries()) {
try {
// Get both source and target datasets
const sourceDataset = await Dataset.query()
.where('id', missing.sourceDatasetId)
.where('server_state', 'published')
.preload('identifier')
.first();
const targetDataset = await Dataset.query().where('id', missing.targetDatasetId).where('server_state', 'published').first();
if (!sourceDataset) {
this.logger.warning(`⚠️ Source dataset ${missing.sourceDatasetId} not found or not published, skipping...`);
errorCount++;
continue;
}
if (!targetDataset) {
this.logger.warning(`⚠️ Target dataset ${missing.targetDatasetId} not found or not published, skipping...`);
errorCount++;
continue;
}
// Create the reverse reference using the referenced_by relationship
// Example: If Dataset 297 IsNewVersionOf Dataset 144
// We create an incoming reference for Dataset 144 that shows Dataset 297 IsPreviousVersionOf it
const reverseReference = new DatasetReference();
// Don't set document_id - this creates an incoming reference via related_document_id
reverseReference.related_document_id = missing.targetDatasetId; // 144 (dataset receiving the incoming reference)
reverseReference.type = 'DOI';
reverseReference.relation = missing.reverseRelation;
// Use the source dataset's DOI for the value (what's being referenced)
if (sourceDataset.identifier?.value) {
reverseReference.value = `https://doi.org/${sourceDataset.identifier.value}`;
} else {
// Fallback to dataset URL if no DOI
reverseReference.value = `https://tethys.at/dataset/${sourceDataset.publish_id || missing.sourceDatasetId}`;
}
// Use the source dataset's main title for the label
reverseReference.label = sourceDataset.mainTitle || `Dataset ${missing.sourceDatasetId}`;
// Also save 'server_date_modified' on target dataset to trigger any downstream updates (e.g. search index)
targetDataset.server_date_modified = DateTime.now();
await targetDataset.save();
await reverseReference.save();
fixedCount++;
if (this.verbose) {
this.logger.info(
`✅ [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.sourceDatasetId} -> ${missing.targetDatasetId} (${missing.reverseRelation})`,
);
} else if ((index + 1) % 10 === 0) {
this.logger.info(`📈 Fixed ${fixedCount}/${missingReferences.length} references...`);
}
} catch (error) {
this.logger.error(
`❌ Error creating reverse reference for datasets ${missing.targetDatasetId} -> ${missing.sourceDatasetId}:`,
error,
);
errorCount++;
}
}
this.logger.info(`📊 Fix completed: ${fixedCount} created, ${errorCount} errors`);
}
}

View file

@@ -0,0 +1,346 @@
/*
|--------------------------------------------------------------------------
| node ace make:command list-updateable-datacite
| DONE: create commands/list_updateable_datacite.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import { CommandOptions } from '@adonisjs/core/types/ace';
import Dataset from '#models/dataset';
import { DoiClient } from '#app/Library/Doi/DoiClient';
import env from '#start/env';
import logger from '@adonisjs/core/services/logger';
import { DateTime } from 'luxon';
import pLimit from 'p-limit';
export default class ListUpdateableDatacite extends BaseCommand {
static commandName = 'list:updateable-datacite';
static description = 'List all datasets that need DataCite DOI updates';
public static needsApplication = true;
// private chunkSize = 100; // Set chunk size for pagination
@flags.boolean({ alias: 'v', description: 'Verbose output showing detailed information' })
public verbose: boolean = false;
@flags.boolean({ alias: 'c', description: 'Show only count of updatable datasets' })
public countOnly: boolean = false;
@flags.boolean({ alias: 'i', description: 'Show only publish IDs (useful for scripting)' })
public idsOnly: boolean = false;
@flags.number({ description: 'Chunk size for processing datasets (default: 50)' })
public chunkSize: number = 50;
//example: node ace list:updateable-datacite
//example: node ace list:updateable-datacite --verbose
//example: node ace list:updateable-datacite --count-only
//example: node ace list:updateable-datacite --ids-only
//example: node ace list:updateable-datacite --chunk-size 50
public static options: CommandOptions = {
startApp: true,
staysAlive: false,
};
async run() {
const prefix = env.get('DATACITE_PREFIX', '');
const base_domain = env.get('BASE_DOMAIN', '');
if (!prefix || !base_domain) {
logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
return;
}
// Prevent conflicting flags
if ((this.verbose && this.countOnly) || (this.verbose && this.idsOnly)) {
logger.error('Flags --verbose cannot be combined with --count-only or --ids-only');
return;
}
const chunkSize = this.chunkSize || 50;
let page = 1;
let hasMoreDatasets = true;
let totalProcessed = 0;
const updatableDatasets: Dataset[] = [];
if (!this.countOnly && !this.idsOnly) {
logger.info(`Processing datasets in chunks of ${chunkSize}...`);
}
while (hasMoreDatasets) {
const datasets = await this.getDatasets(page, chunkSize);
if (datasets.length === 0) {
hasMoreDatasets = false;
break;
}
if (!this.countOnly && !this.idsOnly) {
logger.info(`Processing chunk ${page} (${datasets.length} datasets)...`);
}
const chunkUpdatableDatasets = await this.processChunk(datasets);
updatableDatasets.push(...chunkUpdatableDatasets);
totalProcessed += datasets.length;
page += 1;
if (datasets.length < chunkSize) {
hasMoreDatasets = false;
}
}
if (!this.countOnly && !this.idsOnly) {
logger.info(`Processed ${totalProcessed} datasets total, found ${updatableDatasets.length} that need updates`);
}
if (this.countOnly) {
console.log(updatableDatasets.length);
} else if (this.idsOnly) {
updatableDatasets.forEach((dataset) => console.log(dataset.publish_id));
} else if (this.verbose) {
await this.showVerboseOutput(updatableDatasets);
} else {
this.showSimpleOutput(updatableDatasets);
}
}
/**
* Processes a chunk of datasets to determine which ones need DataCite updates
*
* This method handles parallel processing of datasets within a chunk, providing
* efficient error handling and filtering of results.
*
* @param datasets - Array of Dataset objects to process
* @returns Promise<Dataset[]> - Array of datasets that need updates
*/
// private async processChunk(datasets: Dataset[]): Promise<Dataset[]> {
// // Process datasets in parallel using Promise.allSettled for better error handling
// //
// // Why Promise.allSettled vs Promise.all?
// // - Promise.all fails fast: if ANY promise rejects, the entire operation fails
// // - Promise.allSettled waits for ALL promises: some can fail, others succeed
// // - This is crucial for batch processing where we don't want one bad dataset
// // to stop processing of the entire chunk
// const results = await Promise.allSettled(
// datasets.map(async (dataset) => {
// try {
// // Check if this specific dataset needs a DataCite update
// const needsUpdate = await this.shouldUpdateDataset(dataset);
// // Return the dataset if it needs update, null if it doesn't
// // This creates a sparse array that we'll filter later
// return needsUpdate ? dataset : null;
// } catch (error) {
// // Error handling for individual dataset checks
// //
// // Log warnings only if we're not in silent modes (count-only or ids-only)
// // This prevents log spam when running automated scripts
// if (!this.countOnly && !this.idsOnly) {
// logger.warn(`Error checking dataset ${dataset.publish_id}: ${error.message}`);
// }
// // IMPORTANT DECISION: Return the dataset anyway if we can't determine status
// //
// // Why? It's safer to include a dataset that might not need updating
// // than to miss one that actually does need updating. This follows the
// // "fail-safe" principle - if we're unsure, err on the side of caution
// return dataset;
// }
// }),
// );
// // Filter and extract results from Promise.allSettled response
// //
// // Promise.allSettled returns an array of objects with this structure:
// // - { status: 'fulfilled', value: T } for successful promises
// // - { status: 'rejected', reason: Error } for failed promises
// //
// // We need to:
// // 1. Only get fulfilled results (rejected ones are already handled above)
// // 2. Filter out null values (datasets that don't need updates)
// // 3. Extract the actual Dataset objects from the wrapper
// return results
// .filter(
// (result): result is PromiseFulfilledResult<Dataset | null> =>
// // Type guard: only include fulfilled results that have actual values
// // This filters out:
// // - Rejected promises (shouldn't happen due to try/catch, but safety first)
// // - Fulfilled promises that returned null (datasets that don't need updates)
// result.status === 'fulfilled' && result.value !== null,
// )
// .map((result) => result.value!); // Extract the Dataset from the wrapper
// // The ! is safe here because we filtered out null values above
// }
private async processChunk(datasets: Dataset[]): Promise<Dataset[]> {
// Limit concurrency to avoid API flooding (e.g., max 5 at once)
const limit = pLimit(5);
const tasks = datasets.map((dataset) =>
limit(async () => {
try {
const needsUpdate = await this.shouldUpdateDataset(dataset);
return needsUpdate ? dataset : null;
} catch (error) {
if (!this.countOnly && !this.idsOnly) {
logger.warn(
`Error checking dataset ${dataset.publish_id}: ${
error instanceof Error ? error.message : JSON.stringify(error)
}`,
);
}
// Fail-safe: include dataset if uncertain
return dataset;
}
}),
);
const results = await Promise.allSettled(tasks);
return results
.filter((result): result is PromiseFulfilledResult<Dataset | null> => result.status === 'fulfilled' && result.value !== null)
.map((result) => result.value!);
}
private async getDatasets(page: number, chunkSize: number): Promise<Dataset[]> {
return await Dataset.query()
.orderBy('publish_id', 'asc')
.preload('identifier')
.preload('xmlCache')
.preload('titles')
.where('server_state', 'published')
.whereHas('identifier', (identifierQuery) => {
identifierQuery.where('type', 'doi');
})
.forPage(page, chunkSize); // Get files for the current page
}
private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
try {
let doiIdentifier = dataset.identifier;
if (!doiIdentifier) {
await dataset.load('identifier');
doiIdentifier = dataset.identifier;
}
if (!doiIdentifier || doiIdentifier.type !== 'doi') {
return false;
}
const datasetModified =
dataset.server_date_modified instanceof DateTime
? dataset.server_date_modified
: DateTime.fromJSDate(dataset.server_date_modified);
if (!datasetModified) {
return true;
}
if (datasetModified > DateTime.now()) {
return false;
}
const doiClient = new DoiClient();
const DOI_CHECK_TIMEOUT = 300; // ms
const doiLastModified = await Promise.race([
doiClient.getDoiLastModified(doiIdentifier.value),
this.createTimeoutPromise(DOI_CHECK_TIMEOUT),
]).catch(() => null);
if (!doiLastModified) {
// If uncertain, better include dataset for update
return true;
}
const doiModified = DateTime.fromJSDate(doiLastModified);
if (datasetModified > doiModified) {
const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds);
const toleranceSeconds = 600;
return diffInSeconds > toleranceSeconds;
}
return false;
} catch (error) {
return true; // safer: include dataset if unsure
}
}
/**
* Create a timeout promise for API calls
*/
private createTimeoutPromise(timeoutMs: number): Promise<never> {
return new Promise((_, reject) => {
setTimeout(() => reject(new Error(`API call timeout after ${timeoutMs}ms`)), timeoutMs);
});
}
private showSimpleOutput(updatableDatasets: Dataset[]): void {
if (updatableDatasets.length === 0) {
console.log('No datasets need DataCite updates.');
return;
}
console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);
updatableDatasets.forEach((dataset) => {
console.log(`publish_id ${dataset.publish_id} needs update - ${dataset.mainTitle || 'Untitled'}`);
});
console.log(`\nTo update these datasets, run:`);
console.log(` node ace update:datacite`);
console.log(`\nOr update specific datasets:`);
console.log(` node ace update:datacite -p <publish_id>`);
}
private async showVerboseOutput(updatableDatasets: Dataset[]): Promise<void> {
if (updatableDatasets.length === 0) {
console.log('No datasets need DataCite updates.');
return;
}
console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);
for (const dataset of updatableDatasets) {
await this.showDatasetDetails(dataset);
}
console.log(`\nSummary: ${updatableDatasets.length} datasets need updates`);
}
private async showDatasetDetails(dataset: Dataset): Promise<void> {
try {
let doiIdentifier = dataset.identifier;
if (!doiIdentifier) {
await dataset.load('identifier');
doiIdentifier = dataset.identifier;
}
const doiValue = doiIdentifier?.value || 'N/A';
const datasetModified = dataset.server_date_modified;
// Get DOI info from DataCite
const doiClient = new DoiClient();
const doiLastModified = await doiClient.getDoiLastModified(doiValue);
const doiState = await doiClient.getDoiState(doiValue);
console.log(`┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`);
console.log(`│ Title: ${dataset.mainTitle || 'Untitled'}`);
console.log(`│ DOI: ${doiValue}`);
console.log(`│ DOI State: ${doiState || 'Unknown'}`);
console.log(`│ Dataset Modified: ${datasetModified ? datasetModified.toISO() : 'N/A'}`);
console.log(`│ DOI Modified: ${doiLastModified ? DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}`);
console.log(`│ Status: NEEDS UPDATE`);
console.log(`└─────────────────────────────────────────────────────────────────────────────────────────────\n`);
} catch (error) {
console.log(`┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`);
console.log(`│ Title: ${dataset.mainTitle || 'Untitled'}`);
console.log(`│ DOI: ${dataset.identifier?.value || 'N/A'}`);
console.log(`│ Error: ${error.message}`);
console.log(`│ Status: NEEDS UPDATE (Error checking)`);
console.log(`└─────────────────────────────────────────────────────────────────────────────────────────────\n`);
}
}
}

266
commands/update_datacite.ts Normal file
View file

@@ -0,0 +1,266 @@
/*
|--------------------------------------------------------------------------
| node ace make:command update-datacite
| DONE: create commands/update_datacite.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import { CommandOptions } from '@adonisjs/core/types/ace';
import Dataset from '#models/dataset';
import { DoiClient } from '#app/Library/Doi/DoiClient';
import DoiClientException from '#app/exceptions/DoiClientException';
import Index from '#app/Library/Utils/Index';
import env from '#start/env';
import logger from '@adonisjs/core/services/logger';
import { DateTime } from 'luxon';
import { getDomain } from '#app/utils/utility-functions';
export default class UpdateDatacite extends BaseCommand {
static commandName = 'update:datacite';
static description = 'Update DataCite DOI records for published datasets';
public static needsApplication = true;
@flags.number({ alias: 'p', description: 'Specific publish_id to update' })
public publish_id: number;
@flags.boolean({ alias: 'f', description: 'Force update all records regardless of modification date' })
public force: boolean = false;
@flags.boolean({ alias: 'd', description: 'Dry run - show what would be updated without making changes' })
public dryRun: boolean = false;
@flags.boolean({ alias: 's', description: 'Show detailed stats for each dataset that needs updating' })
public stats: boolean = false;
//example: node ace update:datacite -p 123 --force --dry-run
public static options: CommandOptions = {
startApp: true, // Whether to boot the application before running the command
staysAlive: false, // Whether to keep the process alive after the command has executed
};
async run() {
logger.info('Starting DataCite update process...');
const prefix = env.get('DATACITE_PREFIX', '');
const base_domain = env.get('BASE_DOMAIN', '');
const apiUrl = env.get('DATACITE_API_URL', 'https://api.datacite.org');
if (!prefix || !base_domain) {
logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
return;
}
logger.info(`Using DataCite API: ${apiUrl}`);
const datasets = await this.getDatasets();
logger.info(`Found ${datasets.length} datasets to process`);
let updated = 0;
let skipped = 0;
let errors = 0;
for (const dataset of datasets) {
try {
const shouldUpdate = this.force || (await this.shouldUpdateDataset(dataset));
if (this.stats) {
// Stats mode: show detailed information for datasets that need updating
if (shouldUpdate) {
await this.showDatasetStats(dataset);
updated++;
} else {
skipped++;
}
continue;
}
if (!shouldUpdate) {
logger.info(`Dataset ${dataset.publish_id}: Up to date, skipping`);
skipped++;
continue;
}
if (this.dryRun) {
logger.info(`Dataset ${dataset.publish_id}: Would update DataCite record (dry run)`);
updated++;
continue;
}
await this.updateDataciteRecord(dataset, prefix, base_domain);
logger.info(`Dataset ${dataset.publish_id}: Successfully updated DataCite record`);
updated++;
} catch (error) {
logger.error(`Dataset ${dataset.publish_id}: Failed to update - ${error.message}`);
errors++;
}
}
if (this.stats) {
logger.info(`\nDataCite Stats Summary: ${updated} datasets need updating, ${skipped} are up to date`);
} else {
logger.info(`DataCite update completed. Updated: ${updated}, Skipped: ${skipped}, Errors: ${errors}`);
}
}
private async getDatasets(): Promise<Dataset[]> {
const query = Dataset.query()
.preload('identifier')
.preload('xmlCache')
.where('server_state', 'published')
.whereHas('identifier', (identifierQuery) => {
identifierQuery.where('type', 'doi');
});
if (this.publish_id) {
query.where('publish_id', this.publish_id);
}
return await query.exec();
}
private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
try {
let doiIdentifier = dataset.identifier;
if (!doiIdentifier) {
await dataset.load('identifier');
doiIdentifier = dataset.identifier;
}
if (!doiIdentifier || doiIdentifier.type !== 'doi') {
return false;
}
const datasetModified = dataset.server_date_modified;
const now = DateTime.now();
if (!datasetModified) {
return true; // Update if modification date is missing
}
if (datasetModified > now) {
return false; // Skip invalid future dates
}
// Check DataCite DOI modification date
const doiClient = new DoiClient();
const doiLastModified = await doiClient.getDoiLastModified(doiIdentifier.value);
if (!doiLastModified) {
return false; // do not update if we can't get DOI info
}
const doiModified = DateTime.fromJSDate(doiLastModified);
if (datasetModified > doiModified) {
// if dataset was modified after DOI creation
// Calculate the difference in seconds
const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds);
// Define tolerance threshold (60 seconds = 1 minute)
const toleranceSeconds = 60;
// Only update if the difference is greater than the tolerance
// This prevents unnecessary updates for minor timestamp differences
return diffInSeconds > toleranceSeconds;
} else {
return false; // No update needed
}
} catch (error) {
return false; // do not update if we can't determine the status or another error occurs
}
}
private async updateDataciteRecord(dataset: Dataset, prefix: string, base_domain: string): Promise<void> {
try {
// Get the DOI identifier (HasOne relationship)
let doiIdentifier = dataset.identifier;
if (!doiIdentifier) {
await dataset.load('identifier');
doiIdentifier = dataset.identifier;
}
if (!doiIdentifier || doiIdentifier.type !== 'doi') {
throw new Error('No DOI identifier found for dataset');
}
// Generate XML metadata
const xmlMeta = (await Index.getDoiRegisterString(dataset)) as string;
if (!xmlMeta) {
throw new Error('Failed to generate XML metadata');
}
// Construct DOI value and landing page URL
const doiValue = doiIdentifier.value; // Use existing DOI value
const landingPageUrl = `https://doi.${getDomain(base_domain)}/${doiValue}`;
// Update DataCite record
const doiClient = new DoiClient();
const dataciteResponse = await doiClient.registerDoi(doiValue, xmlMeta, landingPageUrl);
if (dataciteResponse?.status === 201) {
// // Update dataset modification date
// dataset.server_date_modified = DateTime.now();
// await dataset.save();
// // Update search index
// const index_name = 'tethys-records';
// await Index.indexDocument(dataset, index_name);
logger.debug(`Dataset ${dataset.publish_id}: DataCite record and search index updated successfully`);
} else {
throw new DoiClientException(
dataciteResponse?.status || 500,
`Unexpected DataCite response code: ${dataciteResponse?.status}`,
);
}
} catch (error) {
if (error instanceof DoiClientException) {
throw error;
}
throw new Error(`Failed to update DataCite record: ${error.message}`);
}
}
/**
* Shows detailed statistics for a dataset that needs updating
*/
private async showDatasetStats(dataset: Dataset): Promise<void> {
try {
let doiIdentifier = dataset.identifier;
if (!doiIdentifier) {
await dataset.load('identifier');
doiIdentifier = dataset.identifier;
}
const doiValue = doiIdentifier?.value || 'N/A';
const doiStatus = doiIdentifier?.status || 'N/A';
const datasetModified = dataset.server_date_modified;
// Get DOI info from DataCite
const doiClient = new DoiClient();
const doiLastModified = await doiClient.getDoiLastModified(doiValue);
const doiState = await doiClient.getDoiState(doiValue);
console.log(`
Dataset ${dataset.publish_id}
DOI Value: ${doiValue}
DOI Status (DB): ${doiStatus}
DOI State (DataCite): ${doiState || 'Unknown'}
Dataset Modified: ${datasetModified ? datasetModified.toISO() : 'N/A'}
DOI Modified: ${doiLastModified ? DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}
Needs Update: YES - Dataset newer than DOI
`);
} catch (error) {
console.log(`
Dataset ${dataset.publish_id}
DOI Value: ${dataset.identifier?.value || 'N/A'}
Error: ${error.message}
Needs Update: YES - Error checking status
`);
}
}
}

View file

@ -1,47 +1,61 @@
#!/bin/bash
set -e
# # Run freshclam to update virus definitions
# freshclam
echo "Starting ClamAV services..."
# # Sleep for a few seconds to give ClamAV time to start
# sleep 5
# # Start the ClamAV daemon
# /etc/init.d/clamav-daemon start
# Bootstrap the ClamAV service and the ClamAV database updater
set -m
function process_file() {
if [[ ! -z "$1" ]]; then
local SETTING_LIST=$(echo "$1" | tr ',' '\n' | grep "^[A-Za-z][A-Za-z]*=.*$")
local SETTING
for SETTING in ${SETTING_LIST}; do
# Remove any existing copies of this setting. We do this here so that
# settings with multiple values (e.g. ExtraDatabase) can still be added
# multiple times below
local KEY=${SETTING%%=*}
sed -i $2 -e "/^${KEY} /d"
done
for SETTING in ${SETTING_LIST}; do
# Split on first '='
local KEY=${SETTING%%=*}
local VALUE=${SETTING#*=}
echo "${KEY} ${VALUE}" >> "$2"
done
fi
}
# Try to download database if missing
if [ ! "$(ls -A /var/lib/clamav 2>/dev/null)" ]; then
echo "Downloading ClamAV database (this may take a while)..."
# Simple freshclam run without complex config
if sg clamav -c "freshclam --datadir=/var/lib/clamav --quiet"; then
echo "✓ Database downloaded successfully"
else
echo "⚠ Database download failed - creating minimal setup"
# Create a dummy file so clamd doesn't immediately fail
sg clamav -c "touch /var/lib/clamav/.dummy"
fi
fi
# process_file "${CLAMD_SETTINGS_CSV}" /etc/clamav/clamd.conf
# process_file "${FRESHCLAM_SETTINGS_CSV}" /etc/clamav/freshclam.conf
# Start freshclam daemon for automatic updates
echo "Starting freshclam daemon for automatic updates..."
sg clamav -c "freshclam -d" &
# Start clamd daemon in background using sg
echo "Starting ClamAV daemon..."
# Use sg to run clamd with proper group permissions
sg clamav -c "clamd --config-file=/etc/clamav/clamd.conf" &
# Give services time to start
echo "Waiting for services to initialize..."
sleep 8
# simple check
if pgrep clamd > /dev/null; then
echo "✓ ClamAV daemon is running"
else
echo "⚠ ClamAV daemon status uncertain, but continuing..."
fi
# Check if freshclam daemon is running
if pgrep freshclam > /dev/null; then
echo "✓ Freshclam daemon is running"
else
echo "⚠ Freshclam daemon status uncertain, but continuing..."
fi
# # change back to CMD of dockerfile
# exec "$@"
echo "✓ ClamAV setup complete"
echo "Starting main application..."
exec dumb-init -- "$@"

View file

@ -0,0 +1,278 @@
# Dataset Indexing Command
AdonisJS Ace command for indexing and synchronizing published datasets with OpenSearch for search functionality.
## Overview
The `index:datasets` command processes published datasets and creates/updates corresponding search index documents in OpenSearch. It intelligently compares modification timestamps to only re-index datasets when necessary, optimizing performance while maintaining search index accuracy.
## Command Syntax
```bash
node ace index:datasets [options]
```
## Options
| Flag | Alias | Description |
|------|-------|-------------|
| `--publish_id <number>` | `-p` | Index a specific dataset by publish_id |
## Usage Examples
### Basic Operations
```bash
# Index all published datasets that have been modified since last indexing
node ace index:datasets
# Index a specific dataset by publish_id
node ace index:datasets --publish_id 231
node ace index:datasets -p 231
```
## How It Works
### 1. **Dataset Selection**
The command processes datasets that meet these criteria:
- `server_state = 'published'` - Only published datasets
- Has preloaded `xmlCache` relationship for metadata transformation
- Optionally filtered by specific `publish_id`
### 2. **Smart Update Detection**
For each dataset, the command:
- Checks if the dataset exists in the OpenSearch index
- Compares `server_date_modified` timestamps
- Only re-indexes if the dataset is newer than the indexed version
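A minimal sketch of this check, assuming the indexed document stores `server_date_modified` as a Unix timestamp (the OpenSearch client calls are real APIs; the surrounding wiring is illustrative, not the command's actual code):
```typescript
import { Client } from '@opensearch-project/opensearch';
import { DateTime } from 'luxon';

const client = new Client({ node: `http://${process.env.OPENSEARCH_HOST}` });

// Returns true when the dataset should be (re-)indexed.
async function needsIndexing(publishId: number, serverDateModified: DateTime): Promise<boolean> {
    try {
        const { body } = await client.get({ index: 'tethys-records', id: String(publishId) });
        const indexedModified = DateTime.fromMillis(Number(body._source.server_date_modified) * 1000);
        return serverDateModified > indexedModified; // re-index only when the dataset is newer
    } catch {
        return true; // not in the index yet (404) or the check failed, so index it to be safe
    }
}
```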
### 3. **Document Processing**
The indexing process involves:
1. **XML Generation**: Creates structured XML from dataset metadata
2. **XSLT Transformation**: Converts XML to JSON using Saxon-JS processor
3. **Index Update**: Updates or creates the document in OpenSearch
4. **Logging**: Records success/failure for each operation
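A condensed sketch of steps 2 and 3, assuming the XML string has already been generated (the `saxon-js` and OpenSearch calls are real APIs; the helper wiring is illustrative, and the real command also preloads `xmlCache` and handles logging):
```typescript
import { readFile } from 'node:fs/promises';
import SaxonJS from 'saxon-js';
import { Client } from '@opensearch-project/opensearch';

const client = new Client({ node: `http://${process.env.OPENSEARCH_HOST}` });

async function indexDataset(publishId: number, xmlString: string): Promise<void> {
    // XSLT transformation: XML metadata -> JSON document via the compiled stylesheet
    const stylesheetText = await readFile('public/assets2/solr.sef.json', 'utf8');
    const result = await SaxonJS.transform(
        { stylesheetText, sourceText: xmlString, destination: 'serialized' },
        'async',
    );
    const doc = JSON.parse(result.principalResult as string);

    // Index update: create or replace the document under the publish_id
    await client.index({
        index: 'tethys-records',
        id: String(publishId),
        body: doc,
        refresh: true, // immediate availability
    });
}
```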
## Index Structure
### Index Configuration
- **Index Name**: `tethys-records`
- **Document ID**: Dataset `publish_id`
- **Refresh**: `true` (immediate availability)
### Document Fields
The indexed documents contain:
- **Metadata Fields**: Title, description, authors, keywords
- **Identifiers**: DOI, publish_id, and other identifiers
- **Temporal Data**: Publication dates, coverage periods
- **Geographic Data**: Spatial coverage information
- **Technical Details**: Data formats, access information
- **Timestamps**: Creation and modification dates
## Example Output
### Successful Run
```bash
node ace index:datasets
```
```
Found 150 published datasets to process
Dataset with publish_id 231 successfully indexed
Dataset with publish_id 245 is up to date, skipping indexing
Dataset with publish_id 267 successfully indexed
An error occurred while indexing dataset with publish_id 289. Error: Invalid XML metadata
Processing completed: 148 indexed, 1 skipped, 1 error
```
### Specific Dataset
```bash
node ace index:datasets --publish_id 231
```
```
Found 1 published dataset to process
Dataset with publish_id 231 successfully indexed
Processing completed: 1 indexed, 0 skipped, 0 errors
```
## Update Logic
The command uses intelligent indexing to avoid unnecessary processing:
| Condition | Action | Reason |
|-----------|--------|--------|
| Dataset not in index | ✅ Index | New dataset needs indexing |
| Dataset newer than indexed version | ✅ Re-index | Dataset has been updated |
| Dataset same/older than indexed version | ❌ Skip | Already up to date |
| OpenSearch document check fails | ✅ Index | Better safe than sorry |
| Invalid XML metadata | ❌ Skip + Log Error | Cannot process invalid data |
### Timestamp Comparison
```typescript
// Example comparison logic
const existingModified = DateTime.fromMillis(Number(existingDoc.server_date_modified) * 1000);
const currentModified = dataset.server_date_modified;
if (currentModified <= existingModified) {
// Skip - already up to date
return false;
}
// Proceed with indexing
```
## XML Transformation Process
### 1. **XML Generation**
```xml
<?xml version="1.0" encoding="UTF-8" standalone="true"?>
<root>
<Dataset>
<!-- Dataset metadata fields -->
<title>Research Dataset Title</title>
<description>Dataset description...</description>
<!-- Additional metadata -->
</Dataset>
</root>
```
### 2. **XSLT Processing**
The command uses Saxon-JS with a compiled stylesheet (`solr.sef.json`) to transform XML to JSON:
```javascript
const result = await SaxonJS.transform({
stylesheetText: proc,
destination: 'serialized',
sourceText: xmlString,
});
```
### 3. **Final JSON Document**
```json
{
"id": "231",
"title": "Research Dataset Title",
"description": "Dataset description...",
"authors": ["Author Name"],
"server_date_modified": 1634567890,
"publish_id": 231
}
```
## Configuration Requirements
### Environment Variables
```bash
# OpenSearch Configuration
OPENSEARCH_HOST=localhost:9200
# For production:
# OPENSEARCH_HOST=your-opensearch-cluster:9200
```
### Required Files
- **XSLT Stylesheet**: `public/assets2/solr.sef.json` - Compiled Saxon-JS stylesheet for XML transformation
### Database Relationships
The command expects these model relationships:
```typescript
// Dataset model must have:
@hasOne(() => XmlCache, { foreignKey: 'dataset_id' })
public xmlCache: HasOne<typeof XmlCache>
```
## Error Handling
The command handles various error scenarios gracefully:
### Common Errors and Solutions
| Error | Cause | Solution |
|-------|-------|----------|
| `XSLT transformation failed` | Invalid XML or missing stylesheet | Check XML structure and stylesheet path |
| `OpenSearch connection error` | Service unavailable | Verify OpenSearch is running and accessible |
| `JSON parse error` | Malformed transformation result | Check XSLT stylesheet output format |
| `Missing xmlCache relationship` | Data integrity issue | Ensure xmlCache exists for dataset |
### Error Logging
```bash
# Typical error log entry
An error occurred while indexing dataset with publish_id 231.
Error: XSLT transformation failed: Invalid XML structure at line 15
```
## Performance Considerations
### Batch Processing
- Processes datasets sequentially to avoid overwhelming OpenSearch
- Each dataset is committed individually for reliability
- Failed indexing of one dataset doesn't stop processing others
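A sketch of that per-dataset error isolation (illustrative only; the actual loop lives inside the Ace command):
```typescript
type PublishedDataset = { publish_id: number };

async function processAll(
    datasets: PublishedDataset[],
    indexOne: (dataset: PublishedDataset) => Promise<void>,
): Promise<void> {
    let indexed = 0;
    let errors = 0;
    for (const dataset of datasets) {
        try {
            await indexOne(dataset); // sequential: one dataset at a time
            indexed++;
        } catch (error) {
            errors++;
            console.error(`An error occurred while indexing dataset with publish_id ${dataset.publish_id}.`, error);
        }
    }
    console.log(`Processing completed: ${indexed} indexed, ${errors} errors`);
}
```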
### Resource Usage
- **Memory**: XML/JSON transformations require temporary memory
- **Network**: OpenSearch API calls for each dataset
- **CPU**: XSLT transformations are CPU-intensive
### Optimization Tips
```bash
# Index only recently modified datasets (run regularly)
node ace index:datasets
# Index specific datasets when needed
node ace index:datasets --publish_id 231
# Consider running during off-peak hours for large batches
```
## Integration with Other Systems
### Search Functionality
The indexed documents power:
- **Dataset Search**: Full-text search across metadata
- **Faceted Browsing**: Filter by authors, keywords, dates
- **Geographic Search**: Spatial query capabilities
- **Auto-complete**: Suggest dataset titles and keywords
### Related Commands
- [`update:datacite`](update-datacite.md) - Often run after indexing to sync DOI metadata
- **Database migrations** - May require re-indexing after schema changes
### API Integration
The indexed data is consumed by:
- **Search API**: `/api/search` endpoints
- **Browse API**: `/api/datasets` with filtering
- **Recommendations**: Related dataset suggestions
## Monitoring and Maintenance
### Regular Tasks
```bash
# Daily indexing (recommended cron job)
0 2 * * * cd /path/to/project && node ace index:datasets
# Weekly full run (if needed; index:datasets has no --force flag and always skips up-to-date records)
0 3 * * 0 cd /path/to/project && node ace index:datasets
```
### Health Checks
- Monitor OpenSearch cluster health
- Check for failed indexing operations in logs
- Verify search functionality is working
- Compare dataset counts between database and index
### Troubleshooting
```bash
# Check specific dataset indexing
node ace index:datasets --publish_id 231
# Verify OpenSearch connectivity
curl -X GET "localhost:9200/_cluster/health"
# Check index statistics
curl -X GET "localhost:9200/tethys-records/_stats"
```
## Best Practices
1. **Regular Scheduling**: Run the command regularly (daily) to keep the search index current
2. **Monitor Logs**: Watch for transformation errors or OpenSearch issues
3. **Backup Strategy**: Include OpenSearch indices in backup procedures
4. **Resource Management**: Monitor OpenSearch cluster resources during bulk operations
5. **Testing**: Verify search functionality after major indexing operations
6. **Coordination**: Run indexing before DataCite updates when both are needed

View file

@ -0,0 +1,216 @@
# DataCite Update Command
AdonisJS Ace command for updating DataCite DOI records for published datasets.
## Overview
The `update:datacite` command synchronizes your local dataset metadata with DataCite DOI records. It intelligently compares modification dates to only update records when necessary, reducing unnecessary API calls and maintaining data consistency.
## Command Syntax
```bash
node ace update:datacite [options]
```
## Options
| Flag | Alias | Description |
|------|-------|-------------|
| `--publish_id <number>` | `-p` | Update a specific dataset by publish_id |
| `--force` | `-f` | Force update all records regardless of modification date |
| `--dry-run` | `-d` | Preview what would be updated without making changes |
| `--stats` | `-s` | Show detailed statistics for datasets that need updating |
## Usage Examples
### Basic Operations
```bash
# Update all datasets that have been modified since their DOI was last updated
node ace update:datacite
# Update a specific dataset
node ace update:datacite --publish_id 231
node ace update:datacite -p 231
# Force update all datasets with DOIs (ignores modification dates)
node ace update:datacite --force
```
### Preview and Analysis
```bash
# Preview what would be updated (dry run)
node ace update:datacite --dry-run
# Show detailed statistics for datasets that need updating
node ace update:datacite --stats
# Show stats for a specific dataset
node ace update:datacite --stats --publish_id 231
```
### Combined Options
```bash
# Dry run for a specific dataset
node ace update:datacite --dry-run --publish_id 231
# Show stats for all datasets (including up-to-date ones)
node ace update:datacite --stats --force
```
## Command Modes
### 1. **Normal Mode** (Default)
Updates DataCite records for datasets that have been modified since their DOI was last updated.
**Example Output:**
```
Using DataCite API: https://api.test.datacite.org
Found 50 datasets to process
Dataset 231: Successfully updated DataCite record
Dataset 245: Up to date, skipping
Dataset 267: Successfully updated DataCite record
DataCite update completed. Updated: 15, Skipped: 35, Errors: 0
```
### 2. **Dry Run Mode** (`--dry-run`)
Shows what would be updated without making any changes to DataCite.
**Use Case:** Preview updates before running the actual command.
**Example Output:**
```
Dataset 231: Would update DataCite record (dry run)
Dataset 267: Would update DataCite record (dry run)
Dataset 245: Up to date, skipping
DataCite update completed. Updated: 2, Skipped: 1, Errors: 0
```
### 3. **Stats Mode** (`--stats`)
Shows detailed information for each dataset that needs updating, including why it needs updating.
**Use Case:** Debug synchronization issues, monitor dataset/DOI status, generate reports.
**Example Output:**
```
┌─ Dataset 231 ─────────────────────────────────────────────────────────
│ DOI Value: 10.21388/tethys.231
│ DOI Status (DB): findable
│ DOI State (DataCite): findable
│ Dataset Modified: 2024-09-15T10:30:00.000Z
│ DOI Modified: 2024-09-10T08:15:00.000Z
│ Needs Update: YES - Dataset newer than DOI
└───────────────────────────────────────────────────────────────────────
┌─ Dataset 267 ─────────────────────────────────────────────────────────
│ DOI Value: 10.21388/tethys.267
│ DOI Status (DB): findable
│ DOI State (DataCite): findable
│ Dataset Modified: 2024-09-18T14:20:00.000Z
│ DOI Modified: 2024-09-16T12:45:00.000Z
│ Needs Update: YES - Dataset newer than DOI
└───────────────────────────────────────────────────────────────────────
DataCite Stats Summary: 2 datasets need updating, 48 are up to date
```
## Update Logic
The command uses intelligent update detection:
1. **Compares modification dates**: Dataset `server_date_modified` vs DOI last modification date from DataCite
2. **Validates data integrity**: Checks for missing or future dates
3. **Handles API failures gracefully**: Updates anyway if DataCite info can't be retrieved
4. **Uses dual API approach**: DataCite REST API (primary) with MDS API fallback
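For illustration, the DOI modification date used in step 1 could be fetched from the DataCite REST API roughly as follows (a hedged sketch; the actual `DoiClient` implementation and its MDS fallback may differ):
```typescript
async function getDoiLastModified(doi: string): Promise<Date | null> {
    const apiUrl = process.env.DATACITE_API_URL ?? 'https://api.test.datacite.org';
    const auth = Buffer.from(`${process.env.DATACITE_USERNAME}:${process.env.DATACITE_PASSWORD}`).toString('base64');

    const response = await fetch(`${apiUrl}/dois/${encodeURIComponent(doi)}`, {
        headers: { Accept: 'application/vnd.api+json', Authorization: `Basic ${auth}` },
    });
    if (!response.ok) {
        return null; // caller decides how to treat missing DataCite info
    }

    const payload = await response.json();
    const updated = payload?.data?.attributes?.updated; // e.g. "2024-09-10T08:15:00.000Z"
    return updated ? new Date(updated) : null;
}
```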
### When Updates Happen
| Condition | Action | Reason |
|-----------|--------|--------|
| Dataset modified > DOI modified | ✅ Update | Dataset has newer changes |
| Dataset modified ≤ DOI modified | ❌ Skip | DOI is up to date |
| Dataset date in future | ❌ Skip | Invalid data, needs investigation |
| Dataset date missing | ✅ Update | Can't determine staleness |
| DataCite API error | ✅ Update | Better safe than sorry |
| `--force` flag used | ✅ Update | Override all logic |
## Environment Configuration
Required environment variables:
```bash
# DataCite Credentials
DATACITE_USERNAME=your_username
DATACITE_PASSWORD=your_password
# API Endpoints (environment-specific)
DATACITE_API_URL=https://api.test.datacite.org        # Test environment
DATACITE_SERVICE_URL=https://mds.test.datacite.org    # Test MDS
# For production:
# DATACITE_API_URL=https://api.datacite.org
# DATACITE_SERVICE_URL=https://mds.datacite.org
# Project Configuration
DATACITE_PREFIX=10.21388 # Your DOI prefix
BASE_DOMAIN=tethys.at # Your domain
```
## Error Handling
The command handles various error scenarios:
- **Invalid modification dates**: Logs errors but continues processing other datasets
- **DataCite API failures**: Falls back to MDS API, then to safe update
- **Missing DOI identifiers**: Skips datasets without DOI identifiers
- **Network issues**: Continues with next dataset after logging error
## Integration
The command integrates with:
- **Dataset Model**: Uses `server_date_modified` for change detection
- **DatasetIdentifier Model**: Reads DOI values and status
- **OpenSearch Index**: Updates search index after DataCite update
- **DoiClient**: Handles all DataCite API interactions
## Common Workflows
### Daily Maintenance
```bash
# Update any datasets modified today
node ace update:datacite
```
### Pre-Deployment Check
```bash
# Check what would be updated before deployment
node ace update:datacite --dry-run
```
### Debugging Sync Issues
```bash
# Investigate why specific dataset isn't syncing
node ace update:datacite --stats --publish_id 231
```
### Full Resync
```bash
# Force update all DOI records (use with caution)
node ace update:datacite --force
```
### Monitoring Report
```bash
# Generate sync status report
node ace update:datacite --stats > datacite-sync-report.txt
```
## Best Practices
1. **Regular Updates**: Run daily or after bulk dataset modifications
2. **Test First**: Use `--dry-run` or `--stats` before bulk operations
3. **Monitor Logs**: Check for data integrity warnings
4. **Environment Separation**: Use correct API URLs for test vs production
5. **Rate Limiting**: The command handles DataCite rate limits automatically
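For point 5, one minimal way to throttle concurrent DataCite calls is `p-limit`; whether the command uses exactly this pattern is an assumption:
```typescript
import pLimit from 'p-limit';

const limit = pLimit(5); // at most 5 DataCite requests in flight at once

async function updateAll(dois: string[], updateOne: (doi: string) => Promise<void>): Promise<void> {
    await Promise.all(dois.map((doi) => limit(() => updateOne(doi))));
}
```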

View file

@ -1,229 +1,47 @@
##
## Example config file for freshclam
## Please read the freshclam.conf(5) manual before editing this file.
## Container-optimized freshclam configuration
##
# Comment or remove the line below.
# Path to the database directory.
# WARNING: It must match clamd.conf's directive!
# Default: hardcoded (depends on installation options)
# Database directory
DatabaseDirectory /var/lib/clamav
# Path to the log file (make sure it has proper permissions)
# Default: disabled
# Log to stdout for container logging
# UpdateLogFile /dev/stdout
# Maximum size of the log file.
# Value of 0 disables the limit.
# You may use 'M' or 'm' for megabytes (1M = 1m = 1048576 bytes)
# and 'K' or 'k' for kilobytes (1K = 1k = 1024 bytes).
# in bytes just don't use modifiers. If LogFileMaxSize is enabled,
# log rotation (the LogRotate option) will always be enabled.
# Default: 1M
#LogFileMaxSize 2M
# Log time with each message.
# Default: no
# Basic logging settings
LogTime yes
# Enable verbose logging.
# Default: no
LogVerbose yes
# Use system logger (can work together with UpdateLogFile).
# Default: no
LogSyslog no
# Specify the type of syslog messages - please refer to 'man syslog'
# for facility names.
# Default: LOG_LOCAL6
#LogFacility LOG_MAIL
# Enable log rotation. Always enabled when LogFileMaxSize is enabled.
# Default: no
#LogRotate yes
# This option allows you to save the process identifier of the daemon
# Default: disabled
#PidFile /var/run/freshclam.pid
# PID file location
PidFile /var/run/clamav/freshclam.pid
# By default when started freshclam drops privileges and switches to the
# "clamav" user. This directive allows you to change the database owner.
# Default: clamav (may depend on installation options)
# Database owner
DatabaseOwner clamav
# Use DNS to verify virus database version. Freshclam uses DNS TXT records
# to verify database and software versions. With this directive you can change
# the database verification domain.
# WARNING: Do not touch it unless you're configuring freshclam to use your
# own database verification domain.
# Default: current.cvd.clamav.net
#DNSDatabaseInfo current.cvd.clamav.net
# Uncomment the following line and replace XY with your country
# code. See http://www.iana.org/cctld/cctld-whois.htm for the full list.
# You can use db.XY.ipv6.clamav.net for IPv6 connections.
# Mirror settings for Austria
DatabaseMirror db.at.clamav.net
# database.clamav.net is a round-robin record which points to our most
# reliable mirrors. It's used as a fall back in case db.XY.clamav.net is
# not working. DO NOT TOUCH the following line unless you know what you
# are doing.
DatabaseMirror database.clamav.net
# How many attempts to make before giving up.
# Default: 3 (per mirror)
#MaxAttempts 5
# With this option you can control scripted updates. It's highly recommended
# to keep it enabled.
# Default: yes
#ScriptedUpdates yes
# By default freshclam will keep the local databases (.cld) uncompressed to
# make their handling faster. With this option you can enable the compression;
# the change will take effect with the next database update.
# Default: no
#CompressLocalDatabase no
# With this option you can provide custom sources (http:// or file://) for
# database files. This option can be used multiple times.
# Default: no custom URLs
#DatabaseCustomURL http://myserver.com/mysigs.ndb
#DatabaseCustomURL file:///mnt/nfs/local.hdb
# This option allows you to easily point freshclam to private mirrors.
# If PrivateMirror is set, freshclam does not attempt to use DNS
# to determine whether its databases are out-of-date, instead it will
# use the If-Modified-Since request or directly check the headers of the
# remote database files. For each database, freshclam first attempts
# to download the CLD file. If that fails, it tries to download the
# CVD file. This option overrides DatabaseMirror, DNSDatabaseInfo
# and ScriptedUpdates. It can be used multiple times to provide
# fall-back mirrors.
# Default: disabled
#PrivateMirror mirror1.mynetwork.com
#PrivateMirror mirror2.mynetwork.com
# Update settings
ScriptedUpdates yes
# Number of database checks per day.
# Default: 12 (every two hours)
#Checks 24
Checks 12
# Proxy settings
# Default: disabled
#HTTPProxyServer myproxy.com
#HTTPProxyPort 1234
#HTTPProxyUsername myusername
#HTTPProxyPassword mypass
# If your servers are behind a firewall/proxy which applies User-Agent
# filtering you can use this option to force the use of a different
# User-Agent header.
# Default: clamav/version_number
#HTTPUserAgent SomeUserAgentIdString
# Use aaa.bbb.ccc.ddd as client address for downloading databases. Useful for
# multi-homed systems.
# Default: Use OS'es default outgoing IP address.
#LocalIPAddress aaa.bbb.ccc.ddd
# Send the RELOAD command to clamd.
# Default: no
#NotifyClamd /path/to/clamd.conf
# Run command after successful database update.
# Default: disabled
#OnUpdateExecute command
# Run command when database update process fails.
# Default: disabled
#OnErrorExecute command
# Run command when freshclam reports outdated version.
# In the command string %v will be replaced by the new version number.
# Default: disabled
#OnOutdatedExecute command
# Don't fork into background.
# Default: no
# Foreground no keeps the default forking; the entrypoint backgrounds freshclam itself (freshclam -d &)
Foreground no
# Enable debug messages in libclamav.
# Default: no
#Debug yes
# Connection timeouts
ConnectTimeout 60
ReceiveTimeout 60
# Timeout in seconds when connecting to database server.
# Default: 30
#ConnectTimeout 60
# Test databases before using them
TestDatabases yes
# Timeout in seconds when reading from database server.
# Default: 30
#ReceiveTimeout 60
# With this option enabled, freshclam will attempt to load new
# databases into memory to make sure they are properly handled
# by libclamav before replacing the old ones.
# Default: yes
#TestDatabases yes
# When enabled freshclam will submit statistics to the ClamAV Project about
# the latest virus detections in your environment. The ClamAV maintainers
# will then use this data to determine what types of malware are the most
# detected in the field and in what geographic area they are.
# Freshclam will connect to clamd in order to get recent statistics.
# Default: no
#SubmitDetectionStats /path/to/clamd.conf
# Country of origin of malware/detection statistics (for statistical
# purposes only). The statistics collector at ClamAV.net will look up
# your IP address to determine the geographical origin of the malware
# reported by your installation. If this installation is mainly used to
# scan data which comes from a different location, please enable this
# option and enter a two-letter code (see http://www.iana.org/domains/root/db/)
# of the country of origin.
# Default: disabled
#DetectionStatsCountry country-code
# This option enables support for our "Personal Statistics" service.
# When this option is enabled, the information on malware detected by
# your clamd installation is made available to you through our website.
# To get your HostID, log on http://www.stats.clamav.net and add a new
# host to your host list. Once you have the HostID, uncomment this option
# and paste the HostID here. As soon as your freshclam starts submitting
# information to our stats collecting service, you will be able to view
# the statistics of this clamd installation by logging into
# http://www.stats.clamav.net with the same credentials you used to
# generate the HostID. For more information refer to:
# http://www.clamav.net/documentation.html#cctts
# This feature requires SubmitDetectionStats to be enabled.
# Default: disabled
#DetectionStatsHostID unique-id
# This option enables support for Google Safe Browsing. When activated for
# the first time, freshclam will download a new database file (safebrowsing.cvd)
# which will be automatically loaded by clamd and clamscan during the next
# reload, provided that the heuristic phishing detection is turned on. This
# database includes information about websites that may be phishing sites or
# possible sources of malware. When using this option, it's mandatory to run
# freshclam at least every 30 minutes.
# Freshclam uses the ClamAV's mirror infrastructure to distribute the
# database and its updates but all the contents are provided under Google's
# terms of use. See http://www.google.com/transparencyreport/safebrowsing
# and http://www.clamav.net/documentation.html#safebrowsing
# for more information.
# Default: disabled
#SafeBrowsing yes
# This option enables downloading of bytecode.cvd, which includes additional
# detection mechanisms and improvements to the ClamAV engine.
# Default: enabled
#Bytecode yes
# Download an additional 3rd party signature database distributed through
# the ClamAV mirrors.
# This option can be used multiple times.
#ExtraDatabase dbname1
#ExtraDatabase dbname2
# Enable bytecode signatures
Bytecode yes

1044
package-lock.json generated

File diff suppressed because it is too large Load diff

View file

@ -59,7 +59,6 @@
"hot-hook": "^0.4.0",
"numeral": "^2.0.6",
"pinia": "^3.0.2",
"pino-pretty": "^13.0.0",
"postcss-loader": "^8.1.1",
"prettier": "^3.4.2",
"supertest": "^6.3.3",
@ -115,7 +114,9 @@
"node-2fa": "^2.0.3",
"node-exceptions": "^4.0.1",
"notiwind": "^2.0.0",
"p-limit": "^7.1.1",
"pg": "^8.9.0",
"pino-pretty": "^13.0.0",
"qrcode": "^1.5.3",
"redis": "^5.0.0",
"reflect-metadata": "^0.2.1",

View file

@ -6,17 +6,16 @@
import type { ApplicationService } from '@adonisjs/core/types';
import vine, { symbols, BaseLiteralType, Vine } from '@vinejs/vine';
import type { FieldContext, FieldOptions } from '@vinejs/vine/types';
// import type { MultipartFile, FileValidationOptions } from '@adonisjs/bodyparser/types';
import type { MultipartFile } from '@adonisjs/core/bodyparser';
import type { FileValidationOptions } from '@adonisjs/core/types/bodyparser';
import { Request, RequestValidator } from '@adonisjs/core/http';
import MimeType from '#models/mime_type';
/**
* Validation options accepted by the "file" rule
*/
export type FileRuleValidationOptions = Partial<FileValidationOptions> | ((field: FieldContext) => Partial<FileValidationOptions>);
/**
* Extend VineJS
*/
@ -25,6 +24,7 @@ declare module '@vinejs/vine' {
myfile(options?: FileRuleValidationOptions): VineMultipartFile;
}
}
/**
* Extend HTTP request class
*/
@ -36,19 +36,54 @@ declare module '@adonisjs/core/http' {
* Checks if the value is an instance of multipart file
* from bodyparser.
*/
export function isBodyParserFile(file: MultipartFile | unknown): file is MultipartFile {
return !!(file && typeof file === 'object' && 'isMultipartFile' in file);
}
/**
* Cache for enabled extensions to reduce database queries
*/
let extensionsCache: string[] | null = null;
let cacheTimestamp = 0;
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
/**
* Get enabled extensions with caching
*/
export async function getEnabledExtensions(): Promise<string[]> {
const now = Date.now();
if (extensionsCache && now - cacheTimestamp < CACHE_DURATION) {
return extensionsCache;
}
try {
const enabledExtensions = await MimeType.query().select('file_extension').where('enabled', true).exec();
const extensions = enabledExtensions
.map((extension) => extension.file_extension.split('|'))
.flat()
.map((ext) => ext.toLowerCase().trim())
.filter((ext) => ext.length > 0);
extensionsCache = [...new Set(extensions)]; // Remove duplicates
cacheTimestamp = now;
return extensionsCache;
} catch (error) {
console.error('Error fetching enabled extensions:', error);
return extensionsCache || [];
}
}
/**
* Clear extensions cache
*/
export function clearExtensionsCache(): void {
extensionsCache = null;
cacheTimestamp = 0;
}
/**
* VineJS validation rule that validates the file to be an
* instance of BodyParser MultipartFile class.
@ -65,6 +100,7 @@ const isMultipartFile = vine.createRule(async (file: MultipartFile | unknown, op
// At this point, you can use type assertion to explicitly tell TypeScript that file is of type MultipartFile
const validatedFile = file as MultipartFile;
const validationOptions = typeof options === 'function' ? options(field) : options;
/**
* Set size when it's defined in the options and missing
* on the file instance
@ -72,30 +108,29 @@ const isMultipartFile = vine.createRule(async (file: MultipartFile | unknown, op
if (validatedFile.sizeLimit === undefined && validationOptions.size) {
validatedFile.sizeLimit = validationOptions.size;
}
/**
* Set extensions when it's defined in the options and missing
* on the file instance
*/
// if (validatedFile.allowedExtensions === undefined && validationOptions.extnames) {
// validatedFile.allowedExtensions = validationOptions.extnames;
// }
if (validatedFile.allowedExtensions === undefined && validationOptions.extnames !== undefined) {
validatedFile.allowedExtensions = validationOptions.extnames; // await getEnabledExtensions();
} else if (validatedFile.allowedExtensions === undefined && validationOptions.extnames === undefined) {
validatedFile.allowedExtensions = await getEnabledExtensions();
if (validatedFile.allowedExtensions === undefined) {
if (validationOptions.extnames !== undefined) {
validatedFile.allowedExtensions = validationOptions.extnames;
} else {
validatedFile.allowedExtensions = await getEnabledExtensions();
}
}
/**
* TODO: remove this again
* Set clientNameSizeLimit when it's defined in the options and missing
* on the file instance
*/
// if (file.clientNameSizeLimit === undefined && validationOptions.clientNameSizeLimit) {
// file.clientNameSizeLimit = validationOptions.clientNameSizeLimit;
// }
/**
* Validate file
*/
validatedFile.validate();
try {
validatedFile.validate();
} catch (error) {
field.report(`File validation failed: ${error.message}`, 'file.validation_error', field, validationOptions);
return;
}
/**
* Report errors
*/
@ -107,36 +142,37 @@ const isMultipartFile = vine.createRule(async (file: MultipartFile | unknown, op
const MULTIPART_FILE: typeof symbols.SUBTYPE = symbols.SUBTYPE;
export class VineMultipartFile extends BaseLiteralType<MultipartFile, MultipartFile, MultipartFile> {
[MULTIPART_FILE]: string;
// constructor(validationOptions?: FileRuleValidationOptions, options?: FieldOptions) {
// super(options, [isMultipartFile(validationOptions || {})]);
// this.validationOptions = validationOptions;
// this.#private = true;
// }
// clone(): this {
// return new VineMultipartFile(this.validationOptions, this.cloneOptions()) as this;
// }
// #private;
// constructor(validationOptions?: FileRuleValidationOptions, options?: FieldOptions, validations?: Validation<any>[]);
// clone(): this;
public validationOptions?: FileRuleValidationOptions;
// extnames: (18) ['gpkg', 'htm', 'html', 'csv', 'txt', 'asc', 'c', 'cc', 'h', 'srt', 'tiff', 'pdf', 'png', 'zip', 'jpg', 'jpeg', 'jpe', 'xlsx']
// size: '512mb'
// public constructor(validationOptions?: FileRuleValidationOptions, options?: FieldOptions, validations?: Validation<any>[]) {
public constructor(validationOptions?: FileRuleValidationOptions, options?: FieldOptions) {
// super(options, validations);
super(options, [isMultipartFile(validationOptions || {})]);
this.validationOptions = validationOptions;
}
public clone(): any {
// return new VineMultipartFile(this.validationOptions, this.cloneOptions(), this.cloneValidations());
return new VineMultipartFile(this.validationOptions, this.cloneOptions());
}
/**
* Set maximum file size
*/
public maxSize(size: string | number): this {
const newOptions = { ...this.validationOptions, size };
return new VineMultipartFile(newOptions, this.cloneOptions()) as this;
}
/**
* Set allowed extensions
*/
public extensions(extnames: string[]): this {
const newOptions = { ...this.validationOptions, extnames };
return new VineMultipartFile(newOptions, this.cloneOptions()) as this;
}
}
export default class VinejsProvider {
@ -155,13 +191,8 @@ export default class VinejsProvider {
/**
* The container bindings have booted
*/
boot(): void {
// VineString.macro('translatedLanguage', function (this: VineString, options: Options) {
// return this.use(translatedLanguageRule(options));
// });
Vine.macro('myfile', function (this: Vine, options?: FileRuleValidationOptions) {
return new VineMultipartFile(options);
});
@ -175,6 +206,41 @@ export default class VinejsProvider {
}
return new RequestValidator(this.ctx).validateUsing(...args);
});
// Ensure MIME validation macros are loaded
this.loadMimeValidationMacros();
this.loadFileScanMacros();
this.loadFileLengthMacros();
}
/**
* Load MIME validation macros - called during boot to ensure they're available
*/
private async loadMimeValidationMacros(): Promise<void> {
try {
// Dynamically import the MIME validation rule to ensure macros are registered
await import('#start/rules/allowed_extensions_mimetypes');
} catch (error) {
console.warn('Could not load MIME validation macros:', error);
}
}
private async loadFileScanMacros(): Promise<void> {
try {
// Dynamically import the file scan rule to ensure macros are registered
await import('#start/rules/file_scan');
} catch (error) {
console.warn('Could not load file scan macros:', error);
}
}
private async loadFileLengthMacros(): Promise<void> {
try {
// Dynamically import the file length rule to ensure macros are registered
await import('#start/rules/file_length');
} catch (error) {
console.warn('Could not load file length macros:', error);
}
}
/**
@ -190,5 +256,7 @@ export default class VinejsProvider {
/**
* Preparing to shutdown the app
*/
async shutdown() {}
async shutdown() {
clearExtensionsCache();
}
}

174
readme.md
View file

@ -11,6 +11,8 @@ Welcome to the Tethys Research Repository Backend System! This is the backend co
- [Configuration](#configuration)
- [Database](#database)
- [API Documentation](#api-documentation)
- [Commands](#commands)
- [Documentation](#documentation)
- [Contributing](#contributing)
- [License](#license)
@ -29,5 +31,175 @@ Before you begin, ensure you have met the following requirements:
1. Clone this repository:
```bash
git clone https://gitea.geologie.ac.at/geolba/tethys.backend.git
cd tethys.backend
```
2. Install dependencies:
```bash
npm install
```
3. Configure environment variables (see [Configuration](#configuration))
4. Run database migrations:
```bash
node ace migration:run
```
5. Start the development server:
```bash
npm run dev
```
## Usage
The Tethys Backend provides RESTful APIs for managing research datasets, user authentication, DOI registration, and search functionality.
## Configuration
Copy the `.env.example` file to `.env` and configure the following variables:
### Database Configuration
```bash
DB_CONNECTION=pg
DB_HOST=localhost
DB_PORT=5432
DB_USER=your_username
DB_PASSWORD=your_password
DB_DATABASE=tethys_db
```
### DataCite Configuration
```bash
# DataCite Credentials
DATACITE_USERNAME=your_datacite_username
DATACITE_PASSWORD=your_datacite_password
DATACITE_PREFIX=10.21388
# Environment-specific API endpoints
DATACITE_API_URL=https://api.test.datacite.org # Test environment
DATACITE_SERVICE_URL=https://mds.test.datacite.org # Test MDS
# For production:
# DATACITE_API_URL=https://api.datacite.org
# DATACITE_SERVICE_URL=https://mds.datacite.org
```
### OpenSearch Configuration
```bash
OPENSEARCH_HOST=localhost:9200
```
### Application Configuration
```bash
BASE_DOMAIN=tethys.at
APP_KEY=your_app_key
```
## Database
The system uses PostgreSQL with Lucid ORM. Key models include:
- **Dataset**: Research dataset metadata
- **DatasetIdentifier**: DOI and other identifiers for datasets
- **User**: User management and authentication
- **XmlCache**: Cached XML metadata
Run migrations and seeders:
```bash
# Run migrations
node ace migration:run
# Run seeders (if available)
node ace db:seed
```
## API Documentation
API endpoints are available for:
- Dataset management (`/api/datasets`)
- User authentication (`/api/auth`)
- DOI registration (`/api/doi`)
- Search functionality (`/api/search`)
*Detailed API documentation can be found in the `/docs/api` directory.*
## Commands
The system includes several Ace commands for maintenance and data management:
### Dataset Indexing
```bash
# Index all published datasets to OpenSearch
node ace index:datasets
# Index a specific dataset
node ace index:datasets --publish_id 123
```
### DataCite DOI Management
```bash
# Update DataCite records for modified datasets
node ace update:datacite
# Show detailed statistics for datasets needing updates
node ace update:datacite --stats
# Preview what would be updated (dry run)
node ace update:datacite --dry-run
# Force update all DOI records
node ace update:datacite --force
# Update a specific dataset
node ace update:datacite --publish_id 123
```
*For detailed command documentation, see the [Commands Documentation](docs/commands/)*
## Documentation
Comprehensive documentation is available in the `/docs` directory:
- **[Commands Documentation](docs/commands/)** - Detailed guides for Ace commands
- [DataCite Update Command](docs/commands/update-datacite.md) - DOI synchronization and management
- [Dataset Indexing Command](docs/commands/index-datasets.md) - Search index management
- **[API Documentation](docs/api/)** - REST API endpoints and usage
- **[Deployment Guide](docs/deployment/)** - Production deployment instructions
- **[Configuration Guide](docs/configuration/)** - Environment setup and configuration options
## Contributing
1. Fork the repository
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
3. Commit your changes (`git commit -m 'Add some amazing feature'`)
4. Push to the branch (`git push origin feature/amazing-feature`)
5. Open a Pull Request
### Development Guidelines
- Follow the existing code style and conventions
- Write tests for new features
- Update documentation for any API changes
- Ensure all commands and migrations work properly
### Testing Commands
```bash
# Run tests
npm test
# Test specific commands
node ace update:datacite --dry-run --publish_id 123
node ace index:datasets --publish_id 123
```
## License
This project is licensed under the [MIT License](LICENSE).

View file

@ -163,7 +163,7 @@
</div>
</FormControl>
</FormField>
<FormField label="Main Title Language*" help="required: main abstract language"
<FormField label="Main Description Language*" help="required: main abstract language"
:class="{ 'text-red-400': form.errors['descriptions.0.language'] }"
class="w-full ml-1 flex-1">
<FormControl required v-model="form.descriptions[0].language" type="text"

View file

@ -725,7 +725,7 @@ Removes a selected keyword
</div>
</FormControl>
</FormField>
<FormField label="Main Title Language*" help="required: main abstract language"
<FormField label="Main Description Language*" help="required: main abstract language"
:class="{ 'text-red-400': form.errors['descriptions.0.language'] }"
class="w-full mx-2 flex-1">
<FormControl required v-model="form.descriptions[0].language" type="text"

View file

@ -272,7 +272,7 @@
</FormControl>
</FormField>
<FormField
label="Main Title Language*"
label="Main Description Language*"
help="required: main abstract language"
:class="{ 'text-red-400': form.errors['descriptions.0.language'] }"
class="w-full ml-1 flex-1"

View file

@ -8,14 +8,24 @@ import AvatarController from '#controllers/Http/Api/AvatarController';
import UserController from '#controllers/Http/Api/UserController';
import CollectionsController from '#controllers/Http/Api/collections_controller';
import { middleware } from '../kernel.js';
// API
// Clean DOI URL routes (no /api prefix)
// API routes with /api prefix
router
.group(() => {
router.get('clients', [UserController, 'getSubmitters']).as('client.index').use(middleware.auth());
router.get('authors', [AuthorsController, 'index']).as('author.index').use(middleware.auth());
router.get('datasets', [DatasetController, 'index']).as('dataset.index');
router.get('persons', [AuthorsController, 'persons']).as('author.persons');
// This should come BEFORE any other routes that might conflict
router
.get('/dataset/:prefix/:value', [DatasetController, 'findByIdentifier'])
.where('prefix', /^10\.\d+$/) // Match DOI prefix pattern (10.xxxx)
.where('value', /^[a-zA-Z0-9._-]+\.[0-9]+(?:\.[0-9]+)*$/) // Match DOI suffix pattern
.as('dataset.findByIdentifier');
router.get('/dataset', [DatasetController, 'findAll']).as('dataset.findAll');
router.get('/dataset/:publish_id', [DatasetController, 'findOne']).as('dataset.findOne');
router.get('/sitelinks/:year', [HomeController, 'findDocumentsPerYear']);
@ -35,7 +45,7 @@ router
.as('apps.twofactor_backupcodes.create')
.use(middleware.auth());
router.get('collections/:id', [CollectionsController, 'show']).as('collection.show');
})
// .namespace('App/Controllers/Http/Api')
.prefix('api');

View file

@ -1,7 +1,7 @@
/*
|--------------------------------------------------------------------------
| Preloaded File - node ace make:preload rules/orcid
| Do you want to register the preload file in .adonisrc.ts file? (y/N) · true
| DONE: create start/rules/orcid.ts
| DONE: update adonisrc.ts file
|--------------------------------------------------------------------------