Compare commits
4 commits: 8f67839f93 ... 6757bdb77c

| Author | SHA1 | Date |
|---|---|---|
| | 6757bdb77c | |
| | 4c8cce27da | |
| | 2f079e6fdd | |
| | c049b22723 | |

22 changed files with 2865 additions and 914 deletions
77
Dockerfile
|
|
@@ -1,55 +1,61 @@
|
|||
################## First Stage - Creating base #########################
|
||||
|
||||
# Created a variable to hold our node base image
|
||||
ARG NODE_IMAGE=node:22-bookworm-slim
|
||||
ARG NODE_IMAGE=node:22-trixie-slim
|
||||
|
||||
FROM $NODE_IMAGE AS base
|
||||
|
||||
# Install dumb-init and ClamAV, and perform ClamAV database update
|
||||
RUN apt update \
|
||||
&& apt-get install -y dumb-init clamav clamav-daemon nano \
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
dumb-init \
|
||||
clamav \
|
||||
clamav-daemon \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
# Creating folders and changing ownerships
|
||||
&& mkdir -p /home/node/app && chown node:node /home/node/app \
|
||||
&& mkdir -p /home/node/app \
|
||||
&& mkdir -p /var/lib/clamav \
|
||||
&& mkdir /usr/local/share/clamav \
|
||||
&& chown -R node:clamav /var/lib/clamav /usr/local/share/clamav /etc/clamav \
|
||||
# permissions
|
||||
&& mkdir /var/run/clamav \
|
||||
&& chown node:clamav /var/run/clamav \
|
||||
&& chmod 750 /var/run/clamav
|
||||
# -----------------------------------------------
|
||||
# --- ClamAV & FreshClam ------------------------
|
||||
# -----------------------------------------------
|
||||
# RUN \
|
||||
# chmod 644 /etc/clamav/freshclam.conf && \
|
||||
# freshclam && \
|
||||
# mkdir /var/run/clamav && \
|
||||
# chown -R clamav:root /var/run/clamav
|
||||
&& mkdir -p /var/log/clamav \
|
||||
&& mkdir -p /tmp/clamav-logs \
|
||||
|
||||
# Set ownership and permissions
|
||||
&& chown node:node /home/node/app \
|
||||
# && chown -R node:clamav /var/lib/clamav /usr/local/share/clamav /etc/clamav /var/run/clamav \
|
||||
&& chown -R clamav:clamav /var/lib/clamav /usr/local/share/clamav /etc/clamav /var/run/clamav /var/log/clamav \
|
||||
&& chmod 755 /tmp/clamav-logs \
|
||||
&& chmod 750 /var/run/clamav \
|
||||
&& chmod 755 /var/lib/clamav \
|
||||
&& chmod 755 /var/log/clamav \
|
||||
# Add node user to clamav group and allow sudo for clamav commands
|
||||
&& usermod -a -G clamav node \
|
||||
&& chmod g+w /var/run/clamav /var/lib/clamav /var/log/clamav /tmp/clamav-logs
|
||||
|
||||
# # initial update of av databases
|
||||
# RUN freshclam
|
||||
|
||||
# Configure Clam AV...
|
||||
COPY --chown=node:clamav ./*.conf /etc/clamav/
|
||||
# Configure ClamAV - copy config files before switching user
|
||||
# COPY --chown=node:clamav ./*.conf /etc/clamav/
|
||||
COPY --chown=clamav:clamav ./*.conf /etc/clamav/
|
||||
|
||||
# Copy entrypoint script
|
||||
COPY --chown=node:node docker-entrypoint.sh /home/node/app/docker-entrypoint.sh
|
||||
RUN chmod +x /home/node/app/docker-entrypoint.sh
|
||||
|
||||
ENV TZ="Europe/Vienna"
|
||||
|
||||
# # permissions
|
||||
# RUN mkdir /var/run/clamav && \
|
||||
# chown node:clamav /var/run/clamav && \
|
||||
# chmod 750 /var/run/clamav
|
||||
# Setting the working directory
|
||||
WORKDIR /home/node/app
|
||||
# Changing the current active user to "node"
|
||||
|
||||
# Download initial ClamAV database as root before switching users
|
||||
USER root
|
||||
RUN freshclam --quiet || echo "Initial database download failed - will retry at runtime"
|
||||
|
||||
USER node
|
||||
|
||||
# initial update of av databases
|
||||
RUN freshclam
|
||||
|
||||
# VOLUME /var/lib/clamav
|
||||
COPY --chown=node:clamav docker-entrypoint.sh /home/node/app/docker-entrypoint.sh
|
||||
RUN chmod +x /home/node/app/docker-entrypoint.sh
|
||||
ENV TZ="Europe/Vienna"
|
||||
|
||||
|
||||
# Initial update of AV databases (moved after USER directive)
|
||||
# RUN freshclam || true
|
||||
|
||||
|
||||
################## Second Stage - Installing dependencies ##########
|
||||
|
|
@@ -70,14 +76,13 @@ ENV NODE_ENV=production
|
|||
# We run "node ace build" to build the app (dist folder) for production
|
||||
RUN node ace build --ignore-ts-errors
|
||||
# RUN node ace build --production
|
||||
# RUN node ace build --ignore-ts-errors
|
||||
|
||||
|
||||
################## Final Stage - Production #########################
|
||||
# In this final stage, we will start running the application
|
||||
FROM base AS production
|
||||
# Here, we include all the required environment variables
|
||||
# ENV NODE_ENV=production
|
||||
ENV NODE_ENV=production
|
||||
# ENV PORT=$PORT
|
||||
# ENV HOST=0.0.0.0
|
||||
|
||||
|
|
@@ -91,4 +96,4 @@ COPY --chown=node:node --from=build /home/node/app/build .
|
|||
EXPOSE 3333
|
||||
ENTRYPOINT ["/home/node/app/docker-entrypoint.sh"]
|
||||
# Run the command to start the server using "dumb-init"
|
||||
CMD [ "dumb-init", "node", "bin/server.js" ]
|
||||
CMD [ "node", "bin/server.js" ]
|
||||
22
LICENSE
Normal file
|
|
@@ -0,0 +1,22 @@
|
|||
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 Tethys Research Repository
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
|
@@ -30,9 +30,9 @@ export default defineConfig({
|
|||
() => import('#start/rules/unique'),
|
||||
() => import('#start/rules/translated_language'),
|
||||
() => import('#start/rules/unique_person'),
|
||||
() => import('#start/rules/file_length'),
|
||||
() => import('#start/rules/file_scan'),
|
||||
() => import('#start/rules/allowed_extensions_mimetypes'),
|
||||
// () => import('#start/rules/file_length'),
|
||||
// () => import('#start/rules/file_scan'),
|
||||
// () => import('#start/rules/allowed_extensions_mimetypes'),
|
||||
() => import('#start/rules/dependent_array_min_length'),
|
||||
() => import('#start/rules/referenceValidation'),
|
||||
() => import('#start/rules/valid_mimetype'),
|
||||
|
|
|
|||
|
|
@@ -1,23 +1,35 @@
|
|||
import type { HttpContext } from '@adonisjs/core/http';
|
||||
// import Person from 'App/Models/Person';
|
||||
import Dataset from '#models/dataset';
|
||||
import { StatusCodes } from 'http-status-codes';
|
||||
|
||||
// node ace make:controller Author
|
||||
export default class DatasetController {
|
||||
public async index({}: HttpContext) {
|
||||
// Select datasets with server_state 'published' or 'deleted' and sort by the last published date
|
||||
const datasets = await Dataset.query()
|
||||
.where(function (query) {
|
||||
query.where('server_state', 'published').orWhere('server_state', 'deleted');
|
||||
})
|
||||
.preload('titles')
|
||||
.preload('identifier')
|
||||
.orderBy('server_date_published', 'desc');
|
||||
/**
|
||||
* GET /api/datasets
|
||||
* Find all published datasets
|
||||
*/
|
||||
public async index({ response }: HttpContext) {
|
||||
try {
|
||||
const datasets = await Dataset.query()
|
||||
.where(function (query) {
|
||||
query.where('server_state', 'published').orWhere('server_state', 'deleted');
|
||||
})
|
||||
.preload('titles')
|
||||
.preload('identifier')
|
||||
.orderBy('server_date_published', 'desc');
|
||||
|
||||
return datasets;
|
||||
return response.status(StatusCodes.OK).json(datasets);
|
||||
} catch (error) {
|
||||
return response.status(StatusCodes.INTERNAL_SERVER_ERROR).json({
|
||||
message: error.message || 'Some error occurred while retrieving datasets.',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* GET /api/dataset
|
||||
* Find all published datasets
|
||||
*/
|
||||
public async findAll({ response }: HttpContext) {
|
||||
try {
|
||||
const datasets = await Dataset.query()
|
||||
|
|
@@ -33,48 +45,142 @@ export default class DatasetController {
|
|||
}
|
||||
}
|
||||
|
||||
public async findOne({ params }: HttpContext) {
|
||||
const datasets = await Dataset.query()
|
||||
.where('publish_id', params.publish_id)
|
||||
.preload('titles')
|
||||
.preload('descriptions')
|
||||
.preload('user', (builder) => {
|
||||
builder.select(['id', 'firstName', 'lastName', 'avatar', 'login']);
|
||||
})
|
||||
.preload('authors', (builder) => {
|
||||
builder
|
||||
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
|
||||
.withCount('datasets', (query) => {
|
||||
query.as('datasets_count');
|
||||
})
|
||||
.pivotColumns(['role', 'sort_order'])
|
||||
.orderBy('pivot_sort_order', 'asc');
|
||||
})
|
||||
.preload('contributors', (builder) => {
|
||||
builder
|
||||
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
|
||||
.withCount('datasets', (query) => {
|
||||
query.as('datasets_count');
|
||||
})
|
||||
.pivotColumns(['role', 'sort_order', 'contributor_type'])
|
||||
.orderBy('pivot_sort_order', 'asc');
|
||||
})
|
||||
.preload('subjects')
|
||||
.preload('coverage')
|
||||
.preload('licenses')
|
||||
.preload('references')
|
||||
.preload('project')
|
||||
.preload('referenced_by', (builder) => {
|
||||
builder.preload('dataset', (builder) => {
|
||||
builder.preload('identifier');
|
||||
});
|
||||
})
|
||||
.preload('files', (builder) => {
|
||||
builder.preload('hashvalues');
|
||||
})
|
||||
.preload('identifier')
|
||||
.firstOrFail();
|
||||
/**
|
||||
* GET /api/dataset/:publish_id
|
||||
* Find one dataset by publish_id
|
||||
*/
|
||||
public async findOne({ response, params }: HttpContext) {
|
||||
try {
|
||||
const dataset = await Dataset.query()
|
||||
.where('publish_id', params.publish_id)
|
||||
.preload('titles')
|
||||
.preload('descriptions') // Using 'descriptions' instead of 'abstracts'
|
||||
.preload('user', (builder) => {
|
||||
builder.select(['id', 'firstName', 'lastName', 'avatar', 'login']);
|
||||
})
|
||||
.preload('authors', (builder) => {
|
||||
builder
|
||||
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
|
||||
.withCount('datasets', (query) => {
|
||||
query.as('datasets_count');
|
||||
})
|
||||
.pivotColumns(['role', 'sort_order'])
|
||||
.orderBy('pivot_sort_order', 'asc');
|
||||
})
|
||||
.preload('contributors', (builder) => {
|
||||
builder
|
||||
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
|
||||
.withCount('datasets', (query) => {
|
||||
query.as('datasets_count');
|
||||
})
|
||||
.pivotColumns(['role', 'sort_order', 'contributor_type'])
|
||||
.orderBy('pivot_sort_order', 'asc');
|
||||
})
|
||||
.preload('subjects')
|
||||
.preload('coverage')
|
||||
.preload('licenses')
|
||||
.preload('references')
|
||||
.preload('project')
|
||||
.preload('referenced_by', (builder) => {
|
||||
builder.preload('dataset', (builder) => {
|
||||
builder.preload('identifier');
|
||||
});
|
||||
})
|
||||
.preload('files', (builder) => {
|
||||
builder.preload('hashvalues');
|
||||
})
|
||||
.preload('identifier')
|
||||
.first(); // Use first() instead of firstOrFail() to handle not found gracefully
|
||||
|
||||
return datasets;
|
||||
if (!dataset) {
|
||||
return response.status(StatusCodes.NOT_FOUND).json({
|
||||
message: `Cannot find Dataset with publish_id=${params.publish_id}.`,
|
||||
});
|
||||
}
|
||||
|
||||
return response.status(StatusCodes.OK).json(dataset);
|
||||
} catch (error) {
|
||||
return response.status(StatusCodes.INTERNAL_SERVER_ERROR).json({
|
||||
message: error.message || `Error retrieving Dataset with publish_id=${params.publish_id}.`,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* GET /:prefix/:value
|
||||
* Find dataset by identifier (e.g., https://doi.tethys.at/10.24341/tethys.99.2)
|
||||
*/
|
||||
public async findByIdentifier({ response, params }: HttpContext) {
|
||||
const identifierValue = `${params.prefix}/${params.value}`;
|
||||
|
||||
// Optional: Validate DOI format
|
||||
if (!identifierValue.match(/^10\.\d+\/[a-zA-Z0-9._-]+\.[0-9]+(?:\.[0-9]+)*$/)) {
|
||||
return response.status(StatusCodes.BAD_REQUEST).json({
|
||||
message: `Invalid DOI format: ${identifierValue}`,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
// Method 1: Using subquery with whereIn (most similar to your original)
|
||||
const dataset = await Dataset.query()
|
||||
// .whereIn('id', (subQuery) => {
|
||||
// subQuery.select('dataset_id').from('dataset_identifiers').where('value', identifierValue);
|
||||
// })
|
||||
.whereHas('identifier', (builder) => {
|
||||
builder.where('value', identifierValue);
|
||||
})
|
||||
.preload('titles')
|
||||
.preload('descriptions') // Using 'descriptions' instead of 'abstracts'
|
||||
.preload('user', (builder) => {
|
||||
builder.select(['id', 'firstName', 'lastName', 'avatar', 'login']);
|
||||
})
|
||||
.preload('authors', (builder) => {
|
||||
builder
|
||||
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
|
||||
.withCount('datasets', (query) => {
|
||||
query.as('datasets_count');
|
||||
})
|
||||
.pivotColumns(['role', 'sort_order'])
|
||||
.wherePivot('role', 'author')
|
||||
.orderBy('pivot_sort_order', 'asc');
|
||||
})
|
||||
.preload('contributors', (builder) => {
|
||||
builder
|
||||
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
|
||||
.withCount('datasets', (query) => {
|
||||
query.as('datasets_count');
|
||||
})
|
||||
.pivotColumns(['role', 'sort_order', 'contributor_type'])
|
||||
.wherePivot('role', 'contributor')
|
||||
.orderBy('pivot_sort_order', 'asc');
|
||||
})
|
||||
.preload('subjects')
|
||||
.preload('coverage')
|
||||
.preload('licenses')
|
||||
.preload('references')
|
||||
.preload('project')
|
||||
.preload('referenced_by', (builder) => {
|
||||
builder.preload('dataset', (builder) => {
|
||||
builder.preload('identifier');
|
||||
});
|
||||
})
|
||||
.preload('files', (builder) => {
|
||||
builder.preload('hashvalues');
|
||||
})
|
||||
.preload('identifier')
|
||||
.first();
|
||||
|
||||
if (!dataset) {
|
||||
return response.status(StatusCodes.NOT_FOUND).json({
|
||||
message: `Cannot find Dataset with identifier=${identifierValue}.`,
|
||||
});
|
||||
}
|
||||
|
||||
return response.status(StatusCodes.OK).json(dataset);
|
||||
} catch (error) {
|
||||
return response.status(StatusCodes.INTERNAL_SERVER_ERROR).json({
|
||||
message: error.message || `Error retrieving Dataset with identifier=${identifierValue}.`,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
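The endpoint doc comments above (GET /api/datasets, GET /api/dataset/:publish_id, GET /:prefix/:value) imply a route layer that is not part of this diff. Below is a minimal sketch of how these handlers might be registered in an AdonisJS v6 routes file; the file location, import alias, and exact URL patterns are assumptions for illustration, not taken from this change set:

// start/routes.ts (hypothetical excerpt; bindings shown for illustration only)
import router from '@adonisjs/core/services/router';

const DatasetController = () => import('#controllers/dataset_controller');

router.get('/api/datasets', [DatasetController, 'index']);              // published + deleted datasets
router.get('/api/dataset/:publish_id', [DatasetController, 'findOne']); // single dataset by publish_id
router.get('/:prefix/:value', [DatasetController, 'findByIdentifier']); // DOI landing lookup, e.g. /10.24341/tethys.99.2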
|
||||
|
|
|
|||
|
|
@@ -235,6 +235,7 @@ export default class DatasetController {
|
|||
.isUniquePerson({ table: 'persons', column: 'email', idField: 'id' }),
|
||||
first_name: vine.string().trim().minLength(3).maxLength(255).optional().requiredWhen('name_type', '=', 'Personal'),
|
||||
last_name: vine.string().trim().minLength(3).maxLength(255),
|
||||
identifier_orcid: vine.string().trim().maxLength(255).orcid().optional(),
|
||||
}),
|
||||
)
|
||||
.minLength(1)
|
||||
|
|
@@ -251,6 +252,7 @@ export default class DatasetController {
|
|||
.isUniquePerson({ table: 'persons', column: 'email', idField: 'id' }),
|
||||
first_name: vine.string().trim().minLength(3).maxLength(255).optional().requiredWhen('name_type', '=', 'Personal'),
|
||||
last_name: vine.string().trim().minLength(3).maxLength(255),
|
||||
identifier_orcid: vine.string().trim().maxLength(255).orcid().optional(),
|
||||
pivot_contributor_type: vine.enum(Object.keys(ContributorTypes)),
|
||||
}),
|
||||
)
|
||||
|
|
@@ -326,6 +328,7 @@ export default class DatasetController {
|
|||
.isUniquePerson({ table: 'persons', column: 'email', idField: 'id' }),
|
||||
first_name: vine.string().trim().minLength(3).maxLength(255).optional().requiredWhen('name_type', '=', 'Personal'),
|
||||
last_name: vine.string().trim().minLength(3).maxLength(255),
|
||||
identifier_orcid: vine.string().trim().maxLength(255).orcid().optional(),
|
||||
}),
|
||||
)
|
||||
.minLength(1)
|
||||
|
|
@@ -342,6 +345,7 @@ export default class DatasetController {
|
|||
.isUniquePerson({ table: 'persons', column: 'email', idField: 'id' }),
|
||||
first_name: vine.string().trim().minLength(3).maxLength(255).optional().requiredWhen('name_type', '=', 'Personal'),
|
||||
last_name: vine.string().trim().minLength(3).maxLength(255),
|
||||
identifier_orcid: vine.string().trim().maxLength(255).orcid().optional(),
|
||||
pivot_contributor_type: vine.enum(Object.keys(ContributorTypes)),
|
||||
}),
|
||||
)
|
||||
|
|
|
|||
|
|
@@ -1,6 +1,3 @@
|
|||
// import { Client } from 'guzzle';
|
||||
// import { Log } from '@adonisjs/core/build/standalone';
|
||||
// import { DoiInterface } from './interfaces/DoiInterface';
|
||||
import DoiClientContract from '#app/Library/Doi/DoiClientContract';
|
||||
import DoiClientException from '#app/exceptions/DoiClientException';
|
||||
import { StatusCodes } from 'http-status-codes';
|
||||
|
|
@@ -12,14 +9,14 @@ export class DoiClient implements DoiClientContract {
|
|||
public username: string;
|
||||
public password: string;
|
||||
public serviceUrl: string;
|
||||
public apiUrl: string;
|
||||
|
||||
constructor() {
|
||||
// const datacite_environment = process.env.DATACITE_ENVIRONMENT || 'debug';
|
||||
this.username = process.env.DATACITE_USERNAME || '';
|
||||
this.password = process.env.DATACITE_PASSWORD || '';
|
||||
this.serviceUrl = process.env.DATACITE_SERVICE_URL || '';
|
||||
// this.prefix = process.env.DATACITE_PREFIX || '';
|
||||
// this.base_domain = process.env.BASE_DOMAIN || '';
|
||||
this.apiUrl = process.env.DATACITE_API_URL || 'https://api.datacite.org';
|
||||
|
||||
if (this.username === '' || this.password === '' || this.serviceUrl === '') {
|
||||
const message = 'Missing configuration settings to properly initialize DOI client';
|
||||
|
|
@@ -90,4 +87,240 @@ export class DoiClient implements DoiClientContract {
|
|||
throw new DoiClientException(error.response.status, error.response.data);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves DOI information from DataCite REST API
|
||||
*
|
||||
* @param doiValue The DOI identifier e.g. '10.5072/tethys.999'
|
||||
* @returns Promise with DOI information or null if not found
|
||||
*/
|
||||
public async getDoiInfo(doiValue: string): Promise<any | null> {
|
||||
try {
|
||||
// Use configurable DataCite REST API URL
|
||||
const dataciteApiUrl = `${this.apiUrl}/dois/${doiValue}`;
|
||||
const response = await axios.get(dataciteApiUrl, {
|
||||
headers: {
|
||||
Accept: 'application/vnd.api+json',
|
||||
},
|
||||
});
|
||||
|
||||
if (response.status === 200 && response.data.data) {
|
||||
return {
|
||||
created: response.data.data.attributes.created,
|
||||
registered: response.data.data.attributes.registered,
|
||||
updated: response.data.data.attributes.updated,
|
||||
published: response.data.data.attributes.published,
|
||||
state: response.data.data.attributes.state,
|
||||
url: response.data.data.attributes.url,
|
||||
metadata: response.data.data.attributes,
|
||||
};
|
||||
}
|
||||
} catch (error) {
|
||||
if (error.response?.status === 404) {
|
||||
logger.debug(`DOI ${doiValue} not found in DataCite`);
|
||||
return null;
|
||||
}
|
||||
|
||||
logger.debug(`DataCite REST API failed for ${doiValue}: ${error.message}`);
|
||||
|
||||
// Fallback to MDS API
|
||||
return await this.getDoiInfoFromMds(doiValue);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fallback method to get DOI info from MDS API
|
||||
*
|
||||
* @param doiValue The DOI identifier
|
||||
* @returns Promise with basic DOI information or null
|
||||
*/
|
||||
private async getDoiInfoFromMds(doiValue: string): Promise<any | null> {
|
||||
try {
|
||||
const auth = {
|
||||
username: this.username,
|
||||
password: this.password,
|
||||
};
|
||||
|
||||
// Get DOI URL
|
||||
const doiResponse = await axios.get(`${this.serviceUrl}/doi/${doiValue}`, { auth });
|
||||
|
||||
if (doiResponse.status === 200) {
|
||||
// Get metadata if available
|
||||
try {
|
||||
const metadataResponse = await axios.get(`${this.serviceUrl}/metadata/${doiValue}`, {
|
||||
auth,
|
||||
headers: {
|
||||
Accept: 'application/xml',
|
||||
},
|
||||
});
|
||||
|
||||
return {
|
||||
url: doiResponse.data.trim(),
|
||||
metadata: metadataResponse.data,
|
||||
created: new Date().toISOString(), // MDS doesn't provide creation dates
|
||||
registered: new Date().toISOString(), // Use current time as fallback
|
||||
source: 'mds',
|
||||
};
|
||||
} catch (metadataError) {
|
||||
// Return basic info even if metadata fetch fails
|
||||
return {
|
||||
url: doiResponse.data.trim(),
|
||||
created: new Date().toISOString(),
|
||||
registered: new Date().toISOString(),
|
||||
source: 'mds',
|
||||
};
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
if (error.response?.status === 404) {
|
||||
logger.debug(`DOI ${doiValue} not found in DataCite MDS`);
|
||||
return null;
|
||||
}
|
||||
|
||||
logger.debug(`DataCite MDS API failed for ${doiValue}: ${error.message}`);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a DOI exists in DataCite
|
||||
*
|
||||
* @param doiValue The DOI identifier
|
||||
* @returns Promise<boolean> True if DOI exists
|
||||
*/
|
||||
public async doiExists(doiValue: string): Promise<boolean> {
|
||||
const doiInfo = await this.getDoiInfo(doiValue);
|
||||
return doiInfo !== null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the last modification date of a DOI
|
||||
*
|
||||
* @param doiValue The DOI identifier
|
||||
* @returns Promise<Date | null> Last modification date or creation date if never updated, null if not found
|
||||
*/
|
||||
public async getDoiLastModified(doiValue: string): Promise<Date | null> {
|
||||
const doiInfo = await this.getDoiInfo(doiValue);
|
||||
|
||||
if (doiInfo) {
|
||||
// Use updated date if available, otherwise fall back to created/registered date
|
||||
const dateToUse = doiInfo.updated || doiInfo.registered || doiInfo.created;
|
||||
|
||||
if (dateToUse) {
|
||||
logger.debug(
|
||||
`DOI ${doiValue}: Using ${doiInfo.updated ? 'updated' : doiInfo.registered ? 'registered' : 'created'} date: ${dateToUse}`,
|
||||
);
|
||||
return new Date(dateToUse);
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Makes a DOI unfindable (registered but not discoverable)
|
||||
* Note: DOIs cannot be deleted, only made unfindable
|
||||
* await doiClient.makeDoiUnfindable('10.21388/tethys.231');
|
||||
*
|
||||
* @param doiValue The DOI identifier e.g. '10.5072/tethys.999'
|
||||
* @returns Promise<AxiosResponse<any>> The http response
|
||||
*/
|
||||
public async makeDoiUnfindable(doiValue: string): Promise<AxiosResponse<any>> {
|
||||
const auth = {
|
||||
username: this.username,
|
||||
password: this.password,
|
||||
};
|
||||
|
||||
try {
|
||||
// First, check if DOI exists
|
||||
const exists = await this.doiExists(doiValue);
|
||||
if (!exists) {
|
||||
throw new DoiClientException(404, `DOI ${doiValue} not found`);
|
||||
}
|
||||
|
||||
// Delete the DOI URL mapping to make it unfindable
|
||||
// This removes the URL but keeps the metadata registered
|
||||
const response = await axios.delete(`${this.serviceUrl}/doi/${doiValue}`, { auth });
|
||||
|
||||
// Response Codes for DELETE /doi/{doi}
|
||||
// 200 OK: operation successful
|
||||
// 401 Unauthorized: no login
|
||||
// 403 Forbidden: login problem, quota exceeded
|
||||
// 404 Not Found: DOI does not exist
|
||||
if (response.status !== 200) {
|
||||
const message = `Unexpected DataCite MDS response code ${response.status}`;
|
||||
logger.error(message);
|
||||
throw new DoiClientException(response.status, message);
|
||||
}
|
||||
|
||||
logger.info(`DOI ${doiValue} successfully made unfindable`);
|
||||
return response;
|
||||
} catch (error) {
|
||||
logger.error(`Failed to make DOI ${doiValue} unfindable: ${error.message}`);
|
||||
if (error instanceof DoiClientException) {
|
||||
throw error;
|
||||
}
|
||||
throw new DoiClientException(error.response?.status || 500, error.response?.data || error.message);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Makes a DOI findable again by re-registering the URL
|
||||
* await doiClient.makeDoiFindable(
|
||||
* '10.21388/tethys.231',
|
||||
* 'https://doi.dev.tethys.at/10.21388/tethys.231'
|
||||
* );
|
||||
*
|
||||
* @param doiValue The DOI identifier e.g. '10.5072/tethys.999'
|
||||
* @param landingPageUrl The landing page URL
|
||||
* @returns Promise<AxiosResponse<any>> The http response
|
||||
*/
|
||||
public async makeDoiFindable(doiValue: string, landingPageUrl: string): Promise<AxiosResponse<any>> {
|
||||
const auth = {
|
||||
username: this.username,
|
||||
password: this.password,
|
||||
};
|
||||
|
||||
try {
|
||||
// Re-register the DOI with its URL to make it findable again
|
||||
const response = await axios.put(`${this.serviceUrl}/doi/${doiValue}`, `doi=${doiValue}\nurl=${landingPageUrl}`, { auth });
|
||||
|
||||
// Response Codes for PUT /doi/{doi}
|
||||
// 201 Created: operation successful
|
||||
// 400 Bad Request: request body must be exactly two lines: DOI and URL
|
||||
// 401 Unauthorized: no login
|
||||
// 403 Forbidden: login problem, quota exceeded
|
||||
// 412 Precondition failed: metadata must be uploaded first
|
||||
if (response.status !== 201) {
|
||||
const message = `Unexpected DataCite MDS response code ${response.status}`;
|
||||
logger.error(message);
|
||||
throw new DoiClientException(response.status, message);
|
||||
}
|
||||
|
||||
logger.info(`DOI ${doiValue} successfully made findable again`);
|
||||
return response;
|
||||
} catch (error) {
|
||||
logger.error(`Failed to make DOI ${doiValue} findable: ${error.message}`);
|
||||
if (error instanceof DoiClientException) {
|
||||
throw error;
|
||||
}
|
||||
throw new DoiClientException(error.response?.status || 500, error.response?.data || error.message);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the current state of a DOI (draft, registered, findable)
|
||||
* const state = await doiClient.getDoiState('10.21388/tethys.231');
|
||||
* console.log(`Current state: ${state}`); // 'findable'
|
||||
*
|
||||
* @param doiValue The DOI identifier
|
||||
* @returns Promise<string | null> The DOI state or null if not found
|
||||
*/
|
||||
public async getDoiState(doiValue: string): Promise<string | null> {
|
||||
const doiInfo = await this.getDoiInfo(doiValue);
|
||||
return doiInfo?.state || null;
|
||||
}
|
||||
}
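Pulled together, the new helper methods can be combined as in the brief sketch below. It only restates the usage examples already given in the doc comments above; the DOI and landing-page values are placeholders, not real records:

const doiClient = new DoiClient();

// Inspect a DOI before deciding whether it needs an update
if (await doiClient.doiExists('10.21388/tethys.231')) {
    const state = await doiClient.getDoiState('10.21388/tethys.231');               // e.g. 'findable'
    const lastModified = await doiClient.getDoiLastModified('10.21388/tethys.231');  // Date | null
    console.log(state, lastModified?.toISOString());
}

// Temporarily hide a DOI, then re-register its landing page to make it findable again
await doiClient.makeDoiUnfindable('10.21388/tethys.231');
await doiClient.makeDoiFindable('10.21388/tethys.231', 'https://doi.dev.tethys.at/10.21388/tethys.231');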
|
||||
|
|
|
|||
380
commands/fix_dataset_cross_references.ts
Normal file
|
|
@@ -0,0 +1,380 @@
|
|||
/*
|
||||
|--------------------------------------------------------------------------
|
||||
| node ace make:command fix-dataset-cross-references
|
||||
| DONE: create commands/fix_dataset_cross_references.ts
|
||||
|--------------------------------------------------------------------------
|
||||
*/
|
||||
import { BaseCommand, flags } from '@adonisjs/core/ace';
|
||||
import type { CommandOptions } from '@adonisjs/core/types/ace';
|
||||
import { DateTime } from 'luxon';
|
||||
import Dataset from '#models/dataset';
|
||||
import DatasetReference from '#models/dataset_reference';
|
||||
// import env from '#start/env';
|
||||
|
||||
interface MissingCrossReference {
|
||||
sourceDatasetId: number;
|
||||
targetDatasetId: number;
|
||||
sourcePublishId: number | null;
|
||||
targetPublishId: number | null;
|
||||
sourceDoi: string | null;
|
||||
targetDoi: string | null;
|
||||
referenceType: string;
|
||||
relation: string;
|
||||
doi: string | null;
|
||||
reverseRelation: string;
|
||||
}
|
||||
|
||||
export default class DetectMissingCrossReferences extends BaseCommand {
|
||||
static commandName = 'detect:missing-cross-references';
|
||||
static description = 'Detect missing bidirectional cross-references between versioned datasets';
|
||||
|
||||
public static needsApplication = true;
|
||||
|
||||
@flags.boolean({ alias: 'f', description: 'Fix missing cross-references automatically' })
|
||||
public fix: boolean = false;
|
||||
|
||||
@flags.boolean({ alias: 'v', description: 'Verbose output' })
|
||||
public verbose: boolean = false;
|
||||
|
||||
@flags.number({ alias: 'p', description: 'Filter by specific publish_id (source or target dataset)' })
|
||||
public publish_id?: number;
|
||||
|
||||
// example: node ace detect:missing-cross-references --verbose -p 227 //if you want to filter by specific publish_id with details
|
||||
// example: node ace detect:missing-cross-references --verbose
|
||||
// example: node ace detect:missing-cross-references --fix -p 227 //if you want to filter by specific publish_id and fix it
|
||||
// example: node ace detect:missing-cross-references
|
||||
|
||||
public static options: CommandOptions = {
|
||||
startApp: true,
|
||||
staysAlive: false,
|
||||
};
|
||||
|
||||
// Define the allowed relations that we want to process
|
||||
private readonly ALLOWED_RELATIONS = ['IsNewVersionOf', 'IsPreviousVersionOf', 'IsVariantFormOf', 'IsOriginalFormOf'];
|
||||
|
||||
async run() {
|
||||
this.logger.info('🔍 Detecting missing cross-references...');
|
||||
this.logger.info(`📋 Processing only these relations: ${this.ALLOWED_RELATIONS.join(', ')}`);
|
||||
|
||||
if (this.publish_id) {
|
||||
this.logger.info(`Filtering by publish_id: ${this.publish_id}`);
|
||||
}
|
||||
|
||||
try {
|
||||
const missingReferences = await this.findMissingCrossReferences();
|
||||
|
||||
if (missingReferences.length === 0) {
|
||||
const filterMsg = this.publish_id ? ` for publish_id ${this.publish_id}` : '';
|
||||
this.logger.success(`All cross-references are properly linked for the specified relations${filterMsg}!`);
|
||||
return;
|
||||
}
|
||||
|
||||
const filterMsg = this.publish_id ? ` (filtered by publish_id ${this.publish_id})` : '';
|
||||
this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s)${filterMsg}:`);
|
||||
|
||||
// Show brief list if not verbose mode
|
||||
if (!this.verbose) {
|
||||
for (const missing of missingReferences) {
|
||||
const sourceDoi = missing.sourceDoi ? ` DOI: ${missing.sourceDoi}` : '';
|
||||
const targetDoi = missing.targetDoi ? ` DOI: ${missing.targetDoi}` : '';
|
||||
|
||||
this.logger.info(
|
||||
`Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId}${sourceDoi}) ${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId}${targetDoi}) → missing reverse: ${missing.reverseRelation}`,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// Verbose mode - show detailed info
|
||||
for (const missing of missingReferences) {
|
||||
this.logger.info(
|
||||
`Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`,
|
||||
);
|
||||
this.logger.info(` - Reference type: ${missing.referenceType}`);
|
||||
this.logger.info(` - Relation: ${missing.relation}`);
|
||||
this.logger.info(` - DOI: ${missing.doi}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (this.fix) {
|
||||
await this.fixMissingReferences(missingReferences);
|
||||
this.logger.success('All missing cross-references have been fixed!');
|
||||
} else {
|
||||
if (this.verbose) {
|
||||
this.printMissingReferencesList(missingReferences);
|
||||
}
|
||||
this.logger.info('💡 Run with --fix flag to automatically create missing cross-references');
|
||||
if (this.publish_id) {
|
||||
this.logger.info(`🎯 Currently filtering by publish_id: ${this.publish_id}`);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
this.logger.error('Error detecting missing cross-references:', error);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
private async findMissingCrossReferences(): Promise<MissingCrossReference[]> {
|
||||
const missingReferences: {
|
||||
sourceDatasetId: number;
|
||||
targetDatasetId: number;
|
||||
sourcePublishId: number | null;
|
||||
targetPublishId: number | null;
|
||||
sourceDoi: string | null;
|
||||
targetDoi: string | null;
|
||||
referenceType: string;
|
||||
relation: string;
|
||||
doi: string | null;
|
||||
reverseRelation: string;
|
||||
}[] = [];
|
||||
|
||||
this.logger.info('📊 Querying dataset references...');
|
||||
|
||||
// Find all references that point to Tethys datasets (DOI or URL containing tethys DOI)
|
||||
// Only from datasets that are published AND only for allowed relations
|
||||
const tethysReferencesQuery = DatasetReference.query()
|
||||
.whereIn('type', ['DOI', 'URL'])
|
||||
.whereIn('relation', this.ALLOWED_RELATIONS) // Only process allowed relations
|
||||
.where((query) => {
|
||||
query.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
|
||||
})
|
||||
.preload('dataset', (datasetQuery) => {
|
||||
datasetQuery.preload('identifier');
|
||||
})
|
||||
.whereHas('dataset', (datasetQuery) => {
|
||||
datasetQuery.where('server_state', 'published');
|
||||
});
|
||||
if (typeof this.publish_id === 'number') {
|
||||
tethysReferencesQuery.whereHas('dataset', (datasetQuery) => {
|
||||
datasetQuery.where('publish_id', this.publish_id as number);
|
||||
});
|
||||
}
|
||||
|
||||
const tethysReferences = await tethysReferencesQuery.exec();
|
||||
|
||||
this.logger.info(`🔗 Found ${tethysReferences.length} Tethys references from published datasets (allowed relations only)`);
|
||||
|
||||
let processedCount = 0;
|
||||
let skippedCount = 0;
|
||||
|
||||
for (const reference of tethysReferences) {
|
||||
processedCount++;
|
||||
|
||||
if (this.verbose && processedCount % 10 === 0) {
|
||||
this.logger.info(`📈 Processed ${processedCount}/${tethysReferences.length} references...`);
|
||||
}
|
||||
|
||||
// Double-check that this relation is in our allowed list (safety check)
|
||||
if (!this.ALLOWED_RELATIONS.includes(reference.relation)) {
|
||||
skippedCount++;
|
||||
if (this.verbose) {
|
||||
this.logger.info(`⏭️ Skipping relation "${reference.relation}" - not in allowed list`);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Extract dataset publish_id from DOI or URL
|
||||
const targetDatasetPublish = this.extractDatasetPublishIdFromReference(reference.value);
|
||||
|
||||
if (!targetDatasetPublish) {
|
||||
if (this.verbose) {
|
||||
this.logger.warning(`⚠️ Could not extract publish ID from: ${reference.value}`);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if target dataset exists and is published
|
||||
const targetDataset = await Dataset.query()
|
||||
.where('publish_id', targetDatasetPublish)
|
||||
.where('server_state', 'published')
|
||||
.preload('identifier')
|
||||
.first();
|
||||
|
||||
if (!targetDataset) {
|
||||
if (this.verbose) {
|
||||
this.logger.warning(`⚠️ Target dataset with publish_id ${targetDatasetPublish} not found or not published`);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Ensure we have a valid source dataset with proper preloading
|
||||
if (!reference.dataset) {
|
||||
this.logger.warning(`⚠️ Source dataset ${reference.document_id} not properly loaded, skipping...`);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if reverse reference exists
|
||||
const reverseReferenceExists = await this.checkReverseReferenceExists(
|
||||
targetDataset.id,
|
||||
// reference.document_id,
|
||||
reference.relation,
|
||||
);
|
||||
|
||||
if (!reverseReferenceExists) {
|
||||
const reverseRelation = this.getReverseRelation(reference.relation);
|
||||
if (reverseRelation) {
|
||||
// Only add if we have a valid reverse relation
|
||||
missingReferences.push({
|
||||
sourceDatasetId: reference.document_id,
|
||||
targetDatasetId: targetDataset.id,
|
||||
sourcePublishId: reference.dataset.publish_id || null,
|
||||
targetPublishId: targetDataset.publish_id || null,
|
||||
referenceType: reference.type,
|
||||
relation: reference.relation,
|
||||
doi: reference.value,
|
||||
reverseRelation: reverseRelation,
|
||||
sourceDoi: reference.dataset.identifier ? reference.dataset.identifier.value : null,
|
||||
targetDoi: targetDataset.identifier ? targetDataset.identifier.value : null,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.info(`✅ Processed ${processedCount} references (${skippedCount} skipped due to relation filtering)`);
|
||||
return missingReferences;
|
||||
}
|
||||
|
||||
private extractDatasetPublishIdFromReference(value: string): number | null {
|
||||
// Extract from DOI: https://doi.org/10.24341/tethys.107 -> 107
|
||||
const doiMatch = value.match(/10\.24341\/tethys\.(\d+)/);
|
||||
if (doiMatch) {
|
||||
return parseInt(doiMatch[1]);
|
||||
}
|
||||
|
||||
// Extract from URL: https://tethys.at/dataset/107 -> 107
|
||||
const urlMatch = value.match(/tethys\.at\/dataset\/(\d+)/);
|
||||
if (urlMatch) {
|
||||
return parseInt(urlMatch[1]);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private async checkReverseReferenceExists(targetDatasetId: number, originalRelation: string): Promise<boolean> {
|
||||
const reverseRelation = this.getReverseRelation(originalRelation);
|
||||
|
||||
if (!reverseRelation) {
|
||||
return true; // If no reverse relation is defined, consider it as "exists" to skip processing
|
||||
}
|
||||
|
||||
// Only check for reverse references where the source dataset is also published
|
||||
const reverseReference = await DatasetReference.query()
|
||||
// We don't filter by source document_id here to find any incoming reference from any published dataset
|
||||
// .where('document_id', sourceDatasetId)
|
||||
.where('related_document_id', targetDatasetId)
|
||||
.where('relation', reverseRelation)
|
||||
.first();
|
||||
|
||||
return !!reverseReference;
|
||||
}
|
||||
|
||||
private getReverseRelation(relation: string): string | null {
|
||||
const relationMap: Record<string, string> = {
|
||||
IsNewVersionOf: 'IsPreviousVersionOf',
|
||||
IsPreviousVersionOf: 'IsNewVersionOf',
|
||||
IsVariantFormOf: 'IsOriginalFormOf',
|
||||
IsOriginalFormOf: 'IsVariantFormOf',
|
||||
};
|
||||
|
||||
// Only return reverse relation if it exists in our map, otherwise return null
|
||||
return relationMap[relation] || null;
|
||||
}
|
||||
|
||||
private printMissingReferencesList(missingReferences: MissingCrossReference[]) {
|
||||
console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
|
||||
console.log('│ MISSING CROSS-REFERENCES REPORT │');
|
||||
console.log('│ (Published Datasets Only - Filtered Relations) │');
|
||||
console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
|
||||
console.log();
|
||||
|
||||
missingReferences.forEach((missing, index) => {
|
||||
console.log(
|
||||
`${index + 1}. Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId} Identifier: ${missing.sourceDoi})
|
||||
${missing.relation} Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId} Identifier: ${missing.targetDoi})`,
|
||||
);
|
||||
console.log(` ├─ Current relation: "${missing.relation}"`);
|
||||
console.log(` ├─ Missing reverse relation: "${missing.reverseRelation}"`);
|
||||
console.log(` ├─ Reference type: ${missing.referenceType}`);
|
||||
console.log(` └─ DOI/URL: ${missing.doi}`);
|
||||
console.log();
|
||||
});
|
||||
|
||||
console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
|
||||
console.log(`│ SUMMARY: ${missingReferences.length} missing reverse reference(s) detected │`);
|
||||
console.log(`│ Processed relations: ${this.ALLOWED_RELATIONS.join(', ')} │`);
|
||||
console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
|
||||
}
|
||||
|
||||
private async fixMissingReferences(missingReferences: MissingCrossReference[]) {
|
||||
this.logger.info('🔧 Creating missing cross-references in database...');
|
||||
|
||||
let fixedCount = 0;
|
||||
let errorCount = 0;
|
||||
|
||||
for (const [index, missing] of missingReferences.entries()) {
|
||||
try {
|
||||
// Get both source and target datasets
|
||||
const sourceDataset = await Dataset.query()
|
||||
.where('id', missing.sourceDatasetId)
|
||||
.where('server_state', 'published')
|
||||
.preload('identifier')
|
||||
.first();
|
||||
|
||||
const targetDataset = await Dataset.query().where('id', missing.targetDatasetId).where('server_state', 'published').first();
|
||||
|
||||
if (!sourceDataset) {
|
||||
this.logger.warning(`⚠️ Source dataset ${missing.sourceDatasetId} not found or not published, skipping...`);
|
||||
errorCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!targetDataset) {
|
||||
this.logger.warning(`⚠️ Target dataset ${missing.targetDatasetId} not found or not published, skipping...`);
|
||||
errorCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create the reverse reference using the referenced_by relationship
|
||||
// Example: If Dataset 297 IsNewVersionOf Dataset 144
|
||||
// We create an incoming reference for Dataset 144 that shows Dataset 297 IsPreviousVersionOf it
|
||||
const reverseReference = new DatasetReference();
|
||||
// Don't set document_id - this creates an incoming reference via related_document_id
|
||||
reverseReference.related_document_id = missing.targetDatasetId; // 144 (dataset receiving the incoming reference)
|
||||
reverseReference.type = 'DOI';
|
||||
reverseReference.relation = missing.reverseRelation;
|
||||
|
||||
// Use the source dataset's DOI for the value (what's being referenced)
|
||||
if (sourceDataset.identifier?.value) {
|
||||
reverseReference.value = `https://doi.org/${sourceDataset.identifier.value}`;
|
||||
} else {
|
||||
// Fallback to dataset URL if no DOI
|
||||
reverseReference.value = `https://tethys.at/dataset/${sourceDataset.publish_id || missing.sourceDatasetId}`;
|
||||
}
|
||||
|
||||
// Use the source dataset's main title for the label
|
||||
reverseReference.label = sourceDataset.mainTitle || `Dataset ${missing.sourceDatasetId}`;
|
||||
|
||||
// Also save 'server_date_modified' on target dataset to trigger any downstream updates (e.g. search index)
|
||||
targetDataset.server_date_modified = DateTime.now();
|
||||
await targetDataset.save();
|
||||
|
||||
await reverseReference.save();
|
||||
fixedCount++;
|
||||
|
||||
if (this.verbose) {
|
||||
this.logger.info(
|
||||
`✅ [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.sourceDatasetId} -> ${missing.targetDatasetId} (${missing.reverseRelation})`,
|
||||
);
|
||||
} else if ((index + 1) % 10 === 0) {
|
||||
this.logger.info(`📈 Fixed ${fixedCount}/${missingReferences.length} references...`);
|
||||
}
|
||||
} catch (error) {
|
||||
this.logger.error(
|
||||
`❌ Error creating reverse reference for datasets ${missing.targetDatasetId} -> ${missing.sourceDatasetId}:`,
|
||||
error,
|
||||
);
|
||||
errorCount++;
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.info(`📊 Fix completed: ${fixedCount} created, ${errorCount} errors`);
|
||||
}
|
||||
}
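To make the fix step concrete: for the example used in the comments above (dataset 297 IsNewVersionOf dataset 144), the --fix run inserts one incoming reference row on the target side. The sketch below is illustrative only; the DOI, label, and IDs are placeholders consistent with that example, not actual records:

// Existing reference already stored on the source side:
//   document_id = 297, relation = 'IsNewVersionOf', value = '<DOI or URL pointing at dataset 144>'
//
// Reverse reference created by --fix (incoming reference for dataset 144):
const reverse = new DatasetReference();
reverse.related_document_id = 144;                      // target dataset receiving the incoming link
reverse.relation = 'IsPreviousVersionOf';               // getReverseRelation('IsNewVersionOf')
reverse.type = 'DOI';
reverse.value = 'https://doi.org/10.24341/tethys.297';  // placeholder DOI of the source dataset
reverse.label = 'Main title of dataset 297';            // placeholder; taken from sourceDataset.mainTitle
await reverse.save();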
|
||||
346
commands/list_updatable_datacite.ts
Normal file
|
|
@@ -0,0 +1,346 @@
|
|||
/*
|
||||
|--------------------------------------------------------------------------
|
||||
| node ace make:command list-updateable-datacite
|
||||
| DONE: create commands/list_updatable_datacite.ts
|
||||
|--------------------------------------------------------------------------
|
||||
*/
|
||||
import { BaseCommand, flags } from '@adonisjs/core/ace';
|
||||
import { CommandOptions } from '@adonisjs/core/types/ace';
|
||||
import Dataset from '#models/dataset';
|
||||
import { DoiClient } from '#app/Library/Doi/DoiClient';
|
||||
import env from '#start/env';
|
||||
import logger from '@adonisjs/core/services/logger';
|
||||
import { DateTime } from 'luxon';
|
||||
import pLimit from 'p-limit';
|
||||
|
||||
export default class ListUpdateableDatacite extends BaseCommand {
|
||||
static commandName = 'list:updateable-datacite';
|
||||
static description = 'List all datasets that need DataCite DOI updates';
|
||||
|
||||
public static needsApplication = true;
|
||||
|
||||
// private chunkSize = 100; // Set chunk size for pagination
|
||||
|
||||
@flags.boolean({ alias: 'v', description: 'Verbose output showing detailed information' })
|
||||
public verbose: boolean = false;
|
||||
|
||||
@flags.boolean({ alias: 'c', description: 'Show only count of updatable datasets' })
|
||||
public countOnly: boolean = false;
|
||||
|
||||
@flags.boolean({ alias: 'i', description: 'Show only publish IDs (useful for scripting)' })
|
||||
public idsOnly: boolean = false;
|
||||
|
||||
@flags.number({ description: 'Chunk size for processing datasets (default: 50)' })
|
||||
public chunkSize: number = 50;
|
||||
|
||||
//example: node ace list:updateable-datacite
|
||||
//example: node ace list:updateable-datacite --verbose
|
||||
//example: node ace list:updateable-datacite --count-only
|
||||
//example: node ace list:updateable-datacite --ids-only
|
||||
//example: node ace list:updateable-datacite --chunk-size 50
|
||||
|
||||
public static options: CommandOptions = {
|
||||
startApp: true,
|
||||
staysAlive: false,
|
||||
};
|
||||
|
||||
async run() {
|
||||
const prefix = env.get('DATACITE_PREFIX', '');
|
||||
const base_domain = env.get('BASE_DOMAIN', '');
|
||||
|
||||
if (!prefix || !base_domain) {
|
||||
logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
|
||||
return;
|
||||
}
|
||||
|
||||
// Prevent conflicting flags
|
||||
if ((this.verbose && this.countOnly) || (this.verbose && this.idsOnly)) {
|
||||
logger.error('Flags --verbose cannot be combined with --count-only or --ids-only');
|
||||
return;
|
||||
}
|
||||
|
||||
const chunkSize = this.chunkSize || 50;
|
||||
let page = 1;
|
||||
let hasMoreDatasets = true;
|
||||
let totalProcessed = 0;
|
||||
const updatableDatasets: Dataset[] = [];
|
||||
|
||||
if (!this.countOnly && !this.idsOnly) {
|
||||
logger.info(`Processing datasets in chunks of ${chunkSize}...`);
|
||||
}
|
||||
|
||||
while (hasMoreDatasets) {
|
||||
const datasets = await this.getDatasets(page, chunkSize);
|
||||
|
||||
if (datasets.length === 0) {
|
||||
hasMoreDatasets = false;
|
||||
break;
|
||||
}
|
||||
|
||||
if (!this.countOnly && !this.idsOnly) {
|
||||
logger.info(`Processing chunk ${page} (${datasets.length} datasets)...`);
|
||||
}
|
||||
|
||||
const chunkUpdatableDatasets = await this.processChunk(datasets);
|
||||
updatableDatasets.push(...chunkUpdatableDatasets);
|
||||
totalProcessed += datasets.length;
|
||||
|
||||
page += 1;
|
||||
if (datasets.length < chunkSize) {
|
||||
hasMoreDatasets = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!this.countOnly && !this.idsOnly) {
|
||||
logger.info(`Processed ${totalProcessed} datasets total, found ${updatableDatasets.length} that need updates`);
|
||||
}
|
||||
|
||||
if (this.countOnly) {
|
||||
console.log(updatableDatasets.length);
|
||||
} else if (this.idsOnly) {
|
||||
updatableDatasets.forEach((dataset) => console.log(dataset.publish_id));
|
||||
} else if (this.verbose) {
|
||||
await this.showVerboseOutput(updatableDatasets);
|
||||
} else {
|
||||
this.showSimpleOutput(updatableDatasets);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes a chunk of datasets to determine which ones need DataCite updates
|
||||
*
|
||||
* This method handles parallel processing of datasets within a chunk, providing
|
||||
* efficient error handling and filtering of results.
|
||||
*
|
||||
* @param datasets - Array of Dataset objects to process
|
||||
* @returns Promise<Dataset[]> - Array of datasets that need updates
|
||||
*/
|
||||
// private async processChunk(datasets: Dataset[]): Promise<Dataset[]> {
|
||||
// // Process datasets in parallel using Promise.allSettled for better error handling
|
||||
// //
|
||||
// // Why Promise.allSettled vs Promise.all?
|
||||
// // - Promise.all fails fast: if ANY promise rejects, the entire operation fails
|
||||
// // - Promise.allSettled waits for ALL promises: some can fail, others succeed
|
||||
// // - This is crucial for batch processing where we don't want one bad dataset
|
||||
// // to stop processing of the entire chunk
|
||||
// const results = await Promise.allSettled(
|
||||
// datasets.map(async (dataset) => {
|
||||
// try {
|
||||
// // Check if this specific dataset needs a DataCite update
|
||||
// const needsUpdate = await this.shouldUpdateDataset(dataset);
|
||||
|
||||
// // Return the dataset if it needs update, null if it doesn't
|
||||
// // This creates a sparse array that we'll filter later
|
||||
// return needsUpdate ? dataset : null;
|
||||
// } catch (error) {
|
||||
// // Error handling for individual dataset checks
|
||||
// //
|
||||
// // Log warnings only if we're not in silent modes (count-only or ids-only)
|
||||
// // This prevents log spam when running automated scripts
|
||||
// if (!this.countOnly && !this.idsOnly) {
|
||||
// logger.warn(`Error checking dataset ${dataset.publish_id}: ${error.message}`);
|
||||
// }
|
||||
|
||||
// // IMPORTANT DECISION: Return the dataset anyway if we can't determine status
|
||||
// //
|
||||
// // Why? It's safer to include a dataset that might not need updating
|
||||
// // than to miss one that actually does need updating. This follows the
|
||||
// // "fail-safe" principle - if we're unsure, err on the side of caution
|
||||
// return dataset;
|
||||
// }
|
||||
// }),
|
||||
// );
|
||||
|
||||
// // Filter and extract results from Promise.allSettled response
|
||||
// //
|
||||
// // Promise.allSettled returns an array of objects with this structure:
|
||||
// // - { status: 'fulfilled', value: T } for successful promises
|
||||
// // - { status: 'rejected', reason: Error } for failed promises
|
||||
// //
|
||||
// // We need to:
|
||||
// // 1. Only get fulfilled results (rejected ones are already handled above)
|
||||
// // 2. Filter out null values (datasets that don't need updates)
|
||||
// // 3. Extract the actual Dataset objects from the wrapper
|
||||
// return results
|
||||
// .filter(
|
||||
// (result): result is PromiseFulfilledResult<Dataset | null> =>
|
||||
// // Type guard: only include fulfilled results that have actual values
|
||||
// // This filters out:
|
||||
// // - Rejected promises (shouldn't happen due to try/catch, but safety first)
|
||||
// // - Fulfilled promises that returned null (datasets that don't need updates)
|
||||
// result.status === 'fulfilled' && result.value !== null,
|
||||
// )
|
||||
// .map((result) => result.value!); // Extract the Dataset from the wrapper
|
||||
// // The ! is safe here because we filtered out null values above
|
||||
// }
|
||||
|
||||
private async processChunk(datasets: Dataset[]): Promise<Dataset[]> {
|
||||
// Limit concurrency to avoid API flooding (e.g., max 5 at once)
|
||||
const limit = pLimit(5);
|
||||
|
||||
const tasks = datasets.map((dataset) =>
|
||||
limit(async () => {
|
||||
try {
|
||||
const needsUpdate = await this.shouldUpdateDataset(dataset);
|
||||
return needsUpdate ? dataset : null;
|
||||
} catch (error) {
|
||||
if (!this.countOnly && !this.idsOnly) {
|
||||
logger.warn(
|
||||
`Error checking dataset ${dataset.publish_id}: ${
|
||||
error instanceof Error ? error.message : JSON.stringify(error)
|
||||
}`,
|
||||
);
|
||||
}
|
||||
// Fail-safe: include dataset if uncertain
|
||||
return dataset;
|
||||
}
|
||||
}),
|
||||
);
|
||||
|
||||
const results = await Promise.allSettled(tasks);
|
||||
|
||||
return results
|
||||
.filter((result): result is PromiseFulfilledResult<Dataset | null> => result.status === 'fulfilled' && result.value !== null)
|
||||
.map((result) => result.value!);
|
||||
}
|
||||
|
||||
private async getDatasets(page: number, chunkSize: number): Promise<Dataset[]> {
|
||||
return await Dataset.query()
|
||||
.orderBy('publish_id', 'asc')
|
||||
.preload('identifier')
|
||||
.preload('xmlCache')
|
||||
.preload('titles')
|
||||
.where('server_state', 'published')
|
||||
.whereHas('identifier', (identifierQuery) => {
|
||||
identifierQuery.where('type', 'doi');
|
||||
})
|
||||
.forPage(page, chunkSize); // Get files for the current page
|
||||
}
|
||||
|
||||
private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
|
||||
try {
|
||||
let doiIdentifier = dataset.identifier;
|
||||
if (!doiIdentifier) {
|
||||
await dataset.load('identifier');
|
||||
doiIdentifier = dataset.identifier;
|
||||
}
|
||||
|
||||
            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
                return false;
            }

            const datasetModified =
                dataset.server_date_modified instanceof DateTime
                    ? dataset.server_date_modified
                    : DateTime.fromJSDate(dataset.server_date_modified);

            if (!datasetModified) {
                return true;
            }

            if (datasetModified > DateTime.now()) {
                return false;
            }

            const doiClient = new DoiClient();
            const DOI_CHECK_TIMEOUT = 300; // ms

            const doiLastModified = await Promise.race([
                doiClient.getDoiLastModified(doiIdentifier.value),
                this.createTimeoutPromise(DOI_CHECK_TIMEOUT),
            ]).catch(() => null);

            if (!doiLastModified) {
                // If uncertain, better include dataset for update
                return true;
            }

            const doiModified = DateTime.fromJSDate(doiLastModified);
            if (datasetModified > doiModified) {
                const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds);
                const toleranceSeconds = 600;
                return diffInSeconds > toleranceSeconds;
            }
            return false;
        } catch (error) {
            return true; // safer: include dataset if unsure
        }
    }

    /**
     * Create a timeout promise for API calls
     */
    private createTimeoutPromise(timeoutMs: number): Promise<never> {
        return new Promise((_, reject) => {
            setTimeout(() => reject(new Error(`API call timeout after ${timeoutMs}ms`)), timeoutMs);
        });
    }

    private showSimpleOutput(updatableDatasets: Dataset[]): void {
        if (updatableDatasets.length === 0) {
            console.log('No datasets need DataCite updates.');
            return;
        }

        console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);

        updatableDatasets.forEach((dataset) => {
            console.log(`publish_id ${dataset.publish_id} needs update - ${dataset.mainTitle || 'Untitled'}`);
        });

        console.log(`\nTo update these datasets, run:`);
        console.log(`  node ace update:datacite`);
        console.log(`\nOr update specific datasets:`);
        console.log(`  node ace update:datacite -p <publish_id>`);
    }

    private async showVerboseOutput(updatableDatasets: Dataset[]): Promise<void> {
        if (updatableDatasets.length === 0) {
            console.log('No datasets need DataCite updates.');
            return;
        }

        console.log(`\nFound ${updatableDatasets.length} dataset(s) that need DataCite updates:\n`);

        for (const dataset of updatableDatasets) {
            await this.showDatasetDetails(dataset);
        }

        console.log(`\nSummary: ${updatableDatasets.length} datasets need updates`);
    }

    private async showDatasetDetails(dataset: Dataset): Promise<void> {
        try {
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            const doiValue = doiIdentifier?.value || 'N/A';
            const datasetModified = dataset.server_date_modified;

            // Get DOI info from DataCite
            const doiClient = new DoiClient();
            const doiLastModified = await doiClient.getDoiLastModified(doiValue);
            const doiState = await doiClient.getDoiState(doiValue);

            console.log(`┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`);
            console.log(`│ Title: ${dataset.mainTitle || 'Untitled'}`);
            console.log(`│ DOI: ${doiValue}`);
            console.log(`│ DOI State: ${doiState || 'Unknown'}`);
            console.log(`│ Dataset Modified: ${datasetModified ? datasetModified.toISO() : 'N/A'}`);
            console.log(`│ DOI Modified: ${doiLastModified ? DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}`);
            console.log(`│ Status: NEEDS UPDATE`);
            console.log(`└─────────────────────────────────────────────────────────────────────────────────────────────\n`);
        } catch (error) {
            console.log(`┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`);
            console.log(`│ Title: ${dataset.mainTitle || 'Untitled'}`);
            console.log(`│ DOI: ${dataset.identifier?.value || 'N/A'}`);
            console.log(`│ Error: ${error.message}`);
            console.log(`│ Status: NEEDS UPDATE (Error checking)`);
            console.log(`└─────────────────────────────────────────────────────────────────────────────────────────────\n`);
        }
    }
}
266
commands/update_datacite.ts
Normal file

@ -0,0 +1,266 @@
/*
|--------------------------------------------------------------------------
| node ace make:command update-datacite
| DONE: create commands/update_datacite.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import { CommandOptions } from '@adonisjs/core/types/ace';
import Dataset from '#models/dataset';
import { DoiClient } from '#app/Library/Doi/DoiClient';
import DoiClientException from '#app/exceptions/DoiClientException';
import Index from '#app/Library/Utils/Index';
import env from '#start/env';
import logger from '@adonisjs/core/services/logger';
import { DateTime } from 'luxon';
import { getDomain } from '#app/utils/utility-functions';

export default class UpdateDatacite extends BaseCommand {
    static commandName = 'update:datacite';
    static description = 'Update DataCite DOI records for published datasets';

    public static needsApplication = true;

    @flags.number({ alias: 'p', description: 'Specific publish_id to update' })
    public publish_id: number;

    @flags.boolean({ alias: 'f', description: 'Force update all records regardless of modification date' })
    public force: boolean = false;

    @flags.boolean({ alias: 'd', description: 'Dry run - show what would be updated without making changes' })
    public dryRun: boolean = false;

    @flags.boolean({ alias: 's', description: 'Show detailed stats for each dataset that needs updating' })
    public stats: boolean = false;

    // Example: node ace update:datacite -p 123 --force --dry-run

    public static options: CommandOptions = {
        startApp: true, // Whether to boot the application before running the command
        stayAlive: false, // Whether to keep the process alive after the command has executed
    };

    async run() {
        logger.info('Starting DataCite update process...');

        const prefix = env.get('DATACITE_PREFIX', '');
        const base_domain = env.get('BASE_DOMAIN', '');
        const apiUrl = env.get('DATACITE_API_URL', 'https://api.datacite.org');

        if (!prefix || !base_domain) {
            logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
            return;
        }

        logger.info(`Using DataCite API: ${apiUrl}`);

        const datasets = await this.getDatasets();
        logger.info(`Found ${datasets.length} datasets to process`);

        let updated = 0;
        let skipped = 0;
        let errors = 0;

        for (const dataset of datasets) {
            try {
                const shouldUpdate = this.force || (await this.shouldUpdateDataset(dataset));

                if (this.stats) {
                    // Stats mode: show detailed information for datasets that need updating
                    if (shouldUpdate) {
                        await this.showDatasetStats(dataset);
                        updated++;
                    } else {
                        skipped++;
                    }
                    continue;
                }

                if (!shouldUpdate) {
                    logger.info(`Dataset ${dataset.publish_id}: Up to date, skipping`);
                    skipped++;
                    continue;
                }

                if (this.dryRun) {
                    logger.info(`Dataset ${dataset.publish_id}: Would update DataCite record (dry run)`);
                    updated++;
                    continue;
                }

                await this.updateDataciteRecord(dataset, prefix, base_domain);
                logger.info(`Dataset ${dataset.publish_id}: Successfully updated DataCite record`);
                updated++;
            } catch (error) {
                logger.error(`Dataset ${dataset.publish_id}: Failed to update - ${error.message}`);
                errors++;
            }
        }

        if (this.stats) {
            logger.info(`\nDataCite Stats Summary: ${updated} datasets need updating, ${skipped} are up to date`);
        } else {
            logger.info(`DataCite update completed. Updated: ${updated}, Skipped: ${skipped}, Errors: ${errors}`);
        }
    }

    private async getDatasets(): Promise<Dataset[]> {
        const query = Dataset.query()
            .preload('identifier')
            .preload('xmlCache')
            .where('server_state', 'published')
            .whereHas('identifier', (identifierQuery) => {
                identifierQuery.where('type', 'doi');
            });

        if (this.publish_id) {
            query.where('publish_id', this.publish_id);
        }

        return await query.exec();
    }

    private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
        try {
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
                return false;
            }

            const datasetModified = dataset.server_date_modified;
            const now = DateTime.now();

            if (!datasetModified) {
                return true; // Update if modification date is missing
            }

            if (datasetModified > now) {
                return false; // Skip invalid future dates
            }

            // Check DataCite DOI modification date
            const doiClient = new DoiClient();
            const doiLastModified = await doiClient.getDoiLastModified(doiIdentifier.value);

            if (!doiLastModified) {
                return false; // Do not update if we can't get DOI info
            }

            const doiModified = DateTime.fromJSDate(doiLastModified);
            if (datasetModified > doiModified) {
                // The dataset was modified after the DOI was last updated.
                // Calculate the difference in seconds
                const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds);

                // Define tolerance threshold (60 seconds = 1 minute)
                const toleranceSeconds = 60;

                // Only update if the difference is greater than the tolerance.
                // This prevents unnecessary updates for minor timestamp differences.
                return diffInSeconds > toleranceSeconds;
            } else {
                return false; // No update needed
            }
        } catch (error) {
            return false; // Do not update if the status can't be determined or another error occurs
        }
    }

    private async updateDataciteRecord(dataset: Dataset, prefix: string, base_domain: string): Promise<void> {
        try {
            // Get the DOI identifier (HasOne relationship)
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
                throw new Error('No DOI identifier found for dataset');
            }

            // Generate XML metadata
            const xmlMeta = (await Index.getDoiRegisterString(dataset)) as string;
            if (!xmlMeta) {
                throw new Error('Failed to generate XML metadata');
            }

            // Construct DOI value and landing page URL
            const doiValue = doiIdentifier.value; // Use existing DOI value
            const landingPageUrl = `https://doi.${getDomain(base_domain)}/${doiValue}`;

            // Update DataCite record
            const doiClient = new DoiClient();
            const dataciteResponse = await doiClient.registerDoi(doiValue, xmlMeta, landingPageUrl);

            if (dataciteResponse?.status === 201) {
                // // Update dataset modification date
                // dataset.server_date_modified = DateTime.now();
                // await dataset.save();

                // // Update search index
                // const index_name = 'tethys-records';
                // await Index.indexDocument(dataset, index_name);

                logger.debug(`Dataset ${dataset.publish_id}: DataCite record and search index updated successfully`);
            } else {
                throw new DoiClientException(
                    dataciteResponse?.status || 500,
                    `Unexpected DataCite response code: ${dataciteResponse?.status}`,
                );
            }
        } catch (error) {
            if (error instanceof DoiClientException) {
                throw error;
            }
            throw new Error(`Failed to update DataCite record: ${error.message}`);
        }
    }

    /**
     * Shows detailed statistics for a dataset that needs updating
     */
    private async showDatasetStats(dataset: Dataset): Promise<void> {
        try {
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            const doiValue = doiIdentifier?.value || 'N/A';
            const doiStatus = doiIdentifier?.status || 'N/A';
            const datasetModified = dataset.server_date_modified;

            // Get DOI info from DataCite
            const doiClient = new DoiClient();
            const doiLastModified = await doiClient.getDoiLastModified(doiValue);
            const doiState = await doiClient.getDoiState(doiValue);

            console.log(`
┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────
│ DOI Value: ${doiValue}
│ DOI Status (DB): ${doiStatus}
│ DOI State (DataCite): ${doiState || 'Unknown'}
│ Dataset Modified: ${datasetModified ? datasetModified.toISO() : 'N/A'}
│ DOI Modified: ${doiLastModified ? DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}
│ Needs Update: YES - Dataset newer than DOI
└─────────────────────────────────────────────────────────────────────────────────────────────`);
        } catch (error) {
            console.log(`
┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────
│ DOI Value: ${dataset.identifier?.value || 'N/A'}
│ Error: ${error.message}
│ Needs Update: YES - Error checking status
└─────────────────────────────────────────────────────────────────────────────────────────────`);
        }
    }
}

@ -1,47 +1,61 @@

#!/bin/bash
set -e

# # Run freshclam to update virus definitions
# freshclam
echo "Starting ClamAV services..."

# # Sleep for a few seconds to give ClamAV time to start
# sleep 5

# # Start the ClamAV daemon
# /etc/init.d/clamav-daemon start

# bootstrap clam av service and clam av database updater
set -m

function process_file() {
if [[ ! -z "$1" ]]; then
local SETTING_LIST=$(echo "$1" | tr ',' '\n' | grep "^[A-Za-z][A-Za-z]*=.*$")
local SETTING

for SETTING in ${SETTING_LIST}; do
# Remove any existing copies of this setting. We do this here so that
# settings with multiple values (e.g. ExtraDatabase) can still be added
# multiple times below
local KEY=${SETTING%%=*}
sed -i $2 -e "/^${KEY} /d"
done

for SETTING in ${SETTING_LIST}; do
# Split on first '='
local KEY=${SETTING%%=*}
local VALUE=${SETTING#*=}
echo "${KEY} ${VALUE}" >> "$2"
done
# Try to download database if missing
if [ ! "$(ls -A /var/lib/clamav 2>/dev/null)" ]; then
echo "Downloading ClamAV database (this may take a while)..."

# Simple freshclam run without complex config
if sg clamav -c "freshclam --datadir=/var/lib/clamav --quiet"; then
echo "✓ Database downloaded successfully"
else
echo "⚠ Database download failed - creating minimal setup"
# Create a dummy file so clamd doesn't immediately fail
sg clamav -c "touch /var/lib/clamav/.dummy"
fi
}
fi

# process_file "${CLAMD_SETTINGS_CSV}" /etc/clamav/clamd.conf
# process_file "${FRESHCLAM_SETTINGS_CSV}" /etc/clamav/freshclam.conf
# Start freshclam daemon for automatic updates
echo "Starting freshclam daemon for automatic updates..."
sg clamav -c "freshclam -d" &

# start in background
freshclam -d &
# /etc/init.d/clamav-freshclam start &
clamd
# Start clamd in background
# Start clamd in foreground (so dumb-init can supervise it)
# /etc/init.d/clamav-daemon start &

# change back to CMD of dockerfile
exec "$@"
# Start clamd daemon in background using sg
echo "Starting ClamAV daemon..."
# sg clamav -c "clamd" &
# Use sg to run clamd with proper group permissions
# sg clamav -c "clamd" &
sg clamav -c "clamd --config-file=/etc/clamav/clamd.conf" &


# Give services time to start
echo "Waiting for services to initialize..."
sleep 8

# simple check
if pgrep clamd > /dev/null; then
echo "✓ ClamAV daemon is running"
else
echo "⚠ ClamAV daemon status uncertain, but continuing..."
fi

# Check if freshclam daemon is running
if pgrep freshclam > /dev/null; then
echo "✓ Freshclam daemon is running"
else
echo "⚠ Freshclam daemon status uncertain, but continuing..."
fi

# # change back to CMD of dockerfile
# exec "$@"

echo "✓ ClamAV setup complete"
echo "Starting main application..."
exec dumb-init -- "$@"
278
docs/commands/index-datasets.md
Normal file

@ -0,0 +1,278 @@
# Dataset Indexing Command

AdonisJS Ace command for indexing and synchronizing published datasets with OpenSearch for search functionality.

## Overview

The `index:datasets` command processes published datasets and creates/updates corresponding search index documents in OpenSearch. It intelligently compares modification timestamps to only re-index datasets when necessary, optimizing performance while maintaining search index accuracy.

## Command Syntax

```bash
node ace index:datasets [options]
```

## Options

| Flag | Alias | Description |
|------|-------|-------------|
| `--publish_id <number>` | `-p` | Index a specific dataset by publish_id |

## Usage Examples

### Basic Operations

```bash
# Index all published datasets that have been modified since last indexing
node ace index:datasets

# Index a specific dataset by publish_id
node ace index:datasets --publish_id 231
node ace index:datasets -p 231
```

## How It Works

### 1. **Dataset Selection**
The command processes datasets that meet these criteria:
- `server_state = 'published'` - Only published datasets
- Has preloaded `xmlCache` relationship for metadata transformation
- Optionally filtered by specific `publish_id`

### 2. **Smart Update Detection**
For each dataset, the command:
- Checks if the dataset exists in the OpenSearch index
- Compares `server_date_modified` timestamps
- Only re-indexes if the dataset is newer than the indexed version

### 3. **Document Processing**
The indexing process involves:
1. **XML Generation**: Creates structured XML from dataset metadata
2. **XSLT Transformation**: Converts XML to JSON using Saxon-JS processor
3. **Index Update**: Updates or creates the document in OpenSearch (see the sketch below)
4. **Logging**: Records success/failure for each operation

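In the codebase this third step is handled by the project's `Index.indexDocument` helper. The following is only a rough sketch of what that step amounts to, assuming the `@opensearch-project/opensearch` client and the index settings described in the next section (the function name and document shape here are illustrative, not the actual implementation):

```typescript
import { Client } from '@opensearch-project/opensearch';

// Sketch only: the real logic lives in the project's Index helper.
const client = new Client({ node: 'http://localhost:9200' });

async function indexDataset(publishId: number, doc: Record<string, unknown>): Promise<void> {
    await client.index({
        index: 'tethys-records', // index name used by this project
        id: String(publishId), // document ID is the dataset publish_id
        body: doc, // JSON produced by the XSLT transformation
        refresh: true, // make the document searchable immediately
    });
}
```
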
## Index Structure

### Index Configuration
- **Index Name**: `tethys-records`
- **Document ID**: Dataset `publish_id`
- **Refresh**: `true` (immediate availability)

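Because the document ID is simply the `publish_id`, a single indexed record can be spot-checked directly in OpenSearch (assuming the default `OPENSEARCH_HOST` from the configuration section below):

```bash
# Fetch the indexed document for publish_id 231
curl -X GET "localhost:9200/tethys-records/_doc/231"
```
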
### Document Fields
The indexed documents contain:
- **Metadata Fields**: Title, description, authors, keywords
- **Identifiers**: DOI, publish_id, and other identifiers
- **Temporal Data**: Publication dates, coverage periods
- **Geographic Data**: Spatial coverage information
- **Technical Details**: Data formats, access information
- **Timestamps**: Creation and modification dates

## Example Output

### Successful Run
```bash
node ace index:datasets
```
```
Found 150 published datasets to process
Dataset with publish_id 231 successfully indexed
Dataset with publish_id 245 is up to date, skipping indexing
Dataset with publish_id 267 successfully indexed
An error occurred while indexing dataset with publish_id 289. Error: Invalid XML metadata
Processing completed: 148 indexed, 1 skipped, 1 error
```

### Specific Dataset
```bash
node ace index:datasets --publish_id 231
```
```
Found 1 published dataset to process
Dataset with publish_id 231 successfully indexed
Processing completed: 1 indexed, 0 skipped, 0 errors
```

## Update Logic

The command uses intelligent indexing to avoid unnecessary processing:

| Condition | Action | Reason |
|-----------|--------|--------|
| Dataset not in index | ✅ Index | New dataset needs indexing |
| Dataset newer than indexed version | ✅ Re-index | Dataset has been updated |
| Dataset same/older than indexed version | ❌ Skip | Already up to date |
| OpenSearch document check fails | ✅ Index | Better safe than sorry |
| Invalid XML metadata | ❌ Skip + Log Error | Cannot process invalid data |

### Timestamp Comparison
```typescript
// Example comparison logic
const existingModified = DateTime.fromMillis(Number(existingDoc.server_date_modified) * 1000);
const currentModified = dataset.server_date_modified;

if (currentModified <= existingModified) {
    // Skip - already up to date
    return false;
}
// Proceed with indexing
```

## XML Transformation Process

### 1. **XML Generation**
```xml
<?xml version="1.0" encoding="UTF-8" standalone="true"?>
<root>
    <Dataset>
        <!-- Dataset metadata fields -->
        <title>Research Dataset Title</title>
        <description>Dataset description...</description>
        <!-- Additional metadata -->
    </Dataset>
</root>
```

### 2. **XSLT Processing**
The command uses Saxon-JS with a compiled stylesheet (`solr.sef.json`) to transform XML to JSON:
```javascript
const result = await SaxonJS.transform({
    stylesheetText: proc,
    destination: 'serialized',
    sourceText: xmlString,
});
```

### 3. **Final JSON Document**
```json
{
    "id": "231",
    "title": "Research Dataset Title",
    "description": "Dataset description...",
    "authors": ["Author Name"],
    "server_date_modified": 1634567890,
    "publish_id": 231
}
```

## Configuration Requirements

### Environment Variables
```bash
# OpenSearch Configuration
OPENSEARCH_HOST=localhost:9200

# For production:
# OPENSEARCH_HOST=your-opensearch-cluster:9200
```

### Required Files
- **XSLT Stylesheet**: `public/assets2/solr.sef.json` - Compiled Saxon-JS stylesheet for XML transformation

### Database Relationships
The command expects these model relationships:
```typescript
// Dataset model must have:
@hasOne(() => XmlCache, { foreignKey: 'dataset_id' })
public xmlCache: HasOne<typeof XmlCache>
```

## Error Handling

The command handles various error scenarios gracefully:

### Common Errors and Solutions

| Error | Cause | Solution |
|-------|-------|----------|
| `XSLT transformation failed` | Invalid XML or missing stylesheet | Check XML structure and stylesheet path |
| `OpenSearch connection error` | Service unavailable | Verify OpenSearch is running and accessible |
| `JSON parse error` | Malformed transformation result | Check XSLT stylesheet output format |
| `Missing xmlCache relationship` | Data integrity issue | Ensure xmlCache exists for dataset |

### Error Logging
```bash
# Typical error log entry
An error occurred while indexing dataset with publish_id 231.
Error: XSLT transformation failed: Invalid XML structure at line 15
```

## Performance Considerations

### Batch Processing
- Processes datasets sequentially to avoid overwhelming OpenSearch
- Each dataset is committed individually for reliability
- Failed indexing of one dataset doesn't stop processing others

### Resource Usage
- **Memory**: XML/JSON transformations require temporary memory
- **Network**: OpenSearch API calls for each dataset
- **CPU**: XSLT transformations are CPU-intensive

### Optimization Tips
```bash
# Index only recently modified datasets (run regularly)
node ace index:datasets

# Index specific datasets when needed
node ace index:datasets --publish_id 231

# Consider running during off-peak hours for large batches
```

## Integration with Other Systems

### Search Functionality
The indexed documents power:
- **Dataset Search**: Full-text search across metadata
- **Faceted Browsing**: Filter by authors, keywords, dates
- **Geographic Search**: Spatial query capabilities
- **Auto-complete**: Suggest dataset titles and keywords

### Related Commands
- [`update:datacite`](update-datacite.md) - Often run after indexing to sync DOI metadata
- **Database migrations** - May require re-indexing after schema changes

### API Integration
The indexed data is consumed by:
- **Search API**: `/api/search` endpoints
- **Browse API**: `/api/datasets` with filtering
- **Recommendations**: Related dataset suggestions

## Monitoring and Maintenance

### Regular Tasks
```bash
# Daily indexing (recommended cron job)
0 2 * * * cd /path/to/project && node ace index:datasets

# Weekly full re-index (if needed)
0 3 * * 0 cd /path/to/project && node ace index:datasets --force
```

### Health Checks
- Monitor OpenSearch cluster health
- Check for failed indexing operations in logs
- Verify search functionality is working
- Compare dataset counts between database and index

### Troubleshooting
```bash
# Check specific dataset indexing
node ace index:datasets --publish_id 231

# Verify OpenSearch connectivity
curl -X GET "localhost:9200/_cluster/health"

# Check index statistics
curl -X GET "localhost:9200/tethys-records/_stats"
```

## Best Practices

1. **Regular Scheduling**: Run the command regularly (daily) to keep the search index current
2. **Monitor Logs**: Watch for transformation errors or OpenSearch issues
3. **Backup Strategy**: Include OpenSearch indices in backup procedures
4. **Resource Management**: Monitor OpenSearch cluster resources during bulk operations
5. **Testing**: Verify search functionality after major indexing operations
6. **Coordination**: Run indexing before DataCite updates when both are needed
216
docs/commands/update-datacite.md
Normal file

@ -0,0 +1,216 @@
# DataCite Update Command

AdonisJS Ace command for updating DataCite DOI records for published datasets.

## Overview

The `update:datacite` command synchronizes your local dataset metadata with DataCite DOI records. It intelligently compares modification dates to only update records when necessary, reducing unnecessary API calls and maintaining data consistency.

## Command Syntax

```bash
node ace update:datacite [options]
```

## Options

| Flag | Alias | Description |
|------|-------|-------------|
| `--publish_id <number>` | `-p` | Update a specific dataset by publish_id |
| `--force` | `-f` | Force update all records regardless of modification date |
| `--dry-run` | `-d` | Preview what would be updated without making changes |
| `--stats` | `-s` | Show detailed statistics for datasets that need updating |

## Usage Examples

### Basic Operations

```bash
# Update all datasets that have been modified since their DOI was last updated
node ace update:datacite

# Update a specific dataset
node ace update:datacite --publish_id 231
node ace update:datacite -p 231

# Force update all datasets with DOIs (ignores modification dates)
node ace update:datacite --force
```

### Preview and Analysis

```bash
# Preview what would be updated (dry run)
node ace update:datacite --dry-run

# Show detailed statistics for datasets that need updating
node ace update:datacite --stats

# Show stats for a specific dataset
node ace update:datacite --stats --publish_id 231
```

### Combined Options

```bash
# Dry run for a specific dataset
node ace update:datacite --dry-run --publish_id 231

# Show stats for all datasets (including up-to-date ones)
node ace update:datacite --stats --force
```

## Command Modes

### 1. **Normal Mode** (Default)
Updates DataCite records for datasets that have been modified since their DOI was last updated.

**Example Output:**
```
Using DataCite API: https://api.test.datacite.org
Found 50 datasets to process
Dataset 231: Successfully updated DataCite record
Dataset 245: Up to date, skipping
Dataset 267: Successfully updated DataCite record
DataCite update completed. Updated: 15, Skipped: 35, Errors: 0
```

### 2. **Dry Run Mode** (`--dry-run`)
Shows what would be updated without making any changes to DataCite.

**Use Case:** Preview updates before running the actual command.

**Example Output:**
```
Dataset 231: Would update DataCite record (dry run)
Dataset 267: Would update DataCite record (dry run)
Dataset 245: Up to date, skipping
DataCite update completed. Updated: 2, Skipped: 1, Errors: 0
```

### 3. **Stats Mode** (`--stats`)
Shows detailed information for each dataset that needs updating, including why it needs updating.

**Use Case:** Debug synchronization issues, monitor dataset/DOI status, generate reports.

**Example Output:**
```
┌─ Dataset 231 ─────────────────────────────────────────────────────────
│ DOI Value: 10.21388/tethys.231
│ DOI Status (DB): findable
│ DOI State (DataCite): findable
│ Dataset Modified: 2024-09-15T10:30:00.000Z
│ DOI Modified: 2024-09-10T08:15:00.000Z
│ Needs Update: YES - Dataset newer than DOI
└───────────────────────────────────────────────────────────────────────

┌─ Dataset 267 ─────────────────────────────────────────────────────────
│ DOI Value: 10.21388/tethys.267
│ DOI Status (DB): findable
│ DOI State (DataCite): findable
│ Dataset Modified: 2024-09-18T14:20:00.000Z
│ DOI Modified: 2024-09-16T12:45:00.000Z
│ Needs Update: YES - Dataset newer than DOI
└───────────────────────────────────────────────────────────────────────

DataCite Stats Summary: 2 datasets need updating, 48 are up to date
```

## Update Logic

The command uses intelligent update detection:

1. **Compares modification dates**: Dataset `server_date_modified` vs the DOI's last modification date from DataCite (sketched below the table)
2. **Validates data integrity**: Checks for missing or future dates
3. **Handles API failures gracefully**: Updates anyway if DataCite info can't be retrieved
4. **Uses dual API approach**: DataCite REST API (primary) with MDS API fallback

### When Updates Happen

| Condition | Action | Reason |
|-----------|--------|--------|
| Dataset modified > DOI modified | ✅ Update | Dataset has newer changes |
| Dataset modified ≤ DOI modified | ❌ Skip | DOI is up to date |
| Dataset date in future | ❌ Skip | Invalid data, needs investigation |
| Dataset date missing | ✅ Update | Can't determine staleness |
| DataCite API error | ✅ Update | Better safe than sorry |
| `--force` flag used | ✅ Update | Override all logic |

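The core of this decision, condensed from `shouldUpdateDataset()` in `commands/update_datacite.ts`, is a timestamp comparison with a small tolerance so that sub-minute clock differences do not trigger needless updates:

```typescript
// Condensed from the command source (Luxon DateTime values)
const doiModified = DateTime.fromJSDate(doiLastModified);
const toleranceSeconds = 60; // ignore differences of up to one minute

if (datasetModified > doiModified) {
    const diffInSeconds = Math.abs(datasetModified.diff(doiModified, 'seconds').seconds);
    return diffInSeconds > toleranceSeconds; // update only for real changes
}
return false; // DOI is already up to date
```
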
## Environment Configuration

Required environment variables:

```bash
# DataCite Credentials
DATACITE_USERNAME=your_username
DATACITE_PASSWORD=your_password

# API Endpoints (environment-specific)
DATACITE_API_URL=https://api.test.datacite.org      # Test environment
DATACITE_SERVICE_URL=https://mds.test.datacite.org  # Test MDS

DATACITE_API_URL=https://api.datacite.org           # Production
DATACITE_SERVICE_URL=https://mds.datacite.org       # Production MDS

# Project Configuration
DATACITE_PREFIX=10.21388    # Your DOI prefix
BASE_DOMAIN=tethys.at       # Your domain
```

## Error Handling

The command handles various error scenarios:

- **Invalid modification dates**: Logs errors but continues processing other datasets
- **DataCite API failures**: Falls back to the MDS API, then to a safe update
- **Missing DOI identifiers**: Skips datasets without DOI identifiers
- **Network issues**: Continues with the next dataset after logging the error

## Integration

The command integrates with:

- **Dataset Model**: Uses `server_date_modified` for change detection
- **DatasetIdentifier Model**: Reads DOI values and status
- **OpenSearch Index**: Updates search index after DataCite update
- **DoiClient**: Handles all DataCite API interactions

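When debugging a synchronization mismatch, it can also help to look at what DataCite itself currently stores for a DOI. The public DataCite REST API exposes this directly; for example, for the DOI from the stats output above (test endpoint shown, use `api.datacite.org` in production):

```bash
curl "https://api.test.datacite.org/dois/10.21388/tethys.231"
```
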
## Common Workflows

### Daily Maintenance
```bash
# Update any datasets modified today
node ace update:datacite
```

### Pre-Deployment Check
```bash
# Check what would be updated before deployment
node ace update:datacite --dry-run
```

### Debugging Sync Issues
```bash
# Investigate why a specific dataset isn't syncing
node ace update:datacite --stats --publish_id 231
```

### Full Resync
```bash
# Force update all DOI records (use with caution)
node ace update:datacite --force
```

### Monitoring Report
```bash
# Generate sync status report
node ace update:datacite --stats > datacite-sync-report.txt
```

## Best Practices

1. **Regular Updates**: Run daily or after bulk dataset modifications
2. **Test First**: Use `--dry-run` or `--stats` before bulk operations
3. **Monitor Logs**: Check for data integrity warnings
4. **Environment Separation**: Use correct API URLs for test vs production
5. **Rate Limiting**: The command handles DataCite rate limits automatically
222
freshclam.conf
222
freshclam.conf
|
|
@ -1,229 +1,47 @@
|
|||
##
|
||||
## Example config file for freshclam
|
||||
## Please read the freshclam.conf(5) manual before editing this file.
|
||||
## Container-optimized freshclam configuration
|
||||
##
|
||||
|
||||
|
||||
# Comment or remove the line below.
|
||||
|
||||
# Path to the database directory.
|
||||
# WARNING: It must match clamd.conf's directive!
|
||||
# Default: hardcoded (depends on installation options)
|
||||
# Database directory
|
||||
DatabaseDirectory /var/lib/clamav
|
||||
|
||||
# Path to the log file (make sure it has proper permissions)
|
||||
# Default: disabled
|
||||
# Log to stdout for container logging
|
||||
# UpdateLogFile /dev/stdout
|
||||
|
||||
# Maximum size of the log file.
|
||||
# Value of 0 disables the limit.
|
||||
# You may use 'M' or 'm' for megabytes (1M = 1m = 1048576 bytes)
|
||||
# and 'K' or 'k' for kilobytes (1K = 1k = 1024 bytes).
|
||||
# in bytes just don't use modifiers. If LogFileMaxSize is enabled,
|
||||
# log rotation (the LogRotate option) will always be enabled.
|
||||
# Default: 1M
|
||||
#LogFileMaxSize 2M
|
||||
|
||||
# Log time with each message.
|
||||
# Default: no
|
||||
# Basic logging settings
|
||||
LogTime yes
|
||||
|
||||
# Enable verbose logging.
|
||||
# Default: no
|
||||
LogVerbose yes
|
||||
|
||||
# Use system logger (can work together with UpdateLogFile).
|
||||
# Default: no
|
||||
LogVerbose no
|
||||
LogSyslog no
|
||||
|
||||
# Specify the type of syslog messages - please refer to 'man syslog'
|
||||
# for facility names.
|
||||
# Default: LOG_LOCAL6
|
||||
#LogFacility LOG_MAIL
|
||||
|
||||
# Enable log rotation. Always enabled when LogFileMaxSize is enabled.
|
||||
# Default: no
|
||||
#LogRotate yes
|
||||
|
||||
# This option allows you to save the process identifier of the daemon
|
||||
# Default: disabled
|
||||
#PidFile /var/run/freshclam.pid
|
||||
# PID file location
|
||||
PidFile /var/run/clamav/freshclam.pid
|
||||
|
||||
# By default when started freshclam drops privileges and switches to the
|
||||
# "clamav" user. This directive allows you to change the database owner.
|
||||
# Default: clamav (may depend on installation options)
|
||||
DatabaseOwner node
|
||||
# Database owner
|
||||
DatabaseOwner clamav
|
||||
|
||||
# Use DNS to verify virus database version. Freshclam uses DNS TXT records
|
||||
# to verify database and software versions. With this directive you can change
|
||||
# the database verification domain.
|
||||
# WARNING: Do not touch it unless you're configuring freshclam to use your
|
||||
# own database verification domain.
|
||||
# Default: current.cvd.clamav.net
|
||||
#DNSDatabaseInfo current.cvd.clamav.net
|
||||
|
||||
# Uncomment the following line and replace XY with your country
|
||||
# code. See http://www.iana.org/cctld/cctld-whois.htm for the full list.
|
||||
# You can use db.XY.ipv6.clamav.net for IPv6 connections.
|
||||
# Mirror settings for Austria
|
||||
DatabaseMirror db.at.clamav.net
|
||||
|
||||
# database.clamav.net is a round-robin record which points to our most
|
||||
# reliable mirrors. It's used as a fall back in case db.XY.clamav.net is
|
||||
# not working. DO NOT TOUCH the following line unless you know what you
|
||||
# are doing.
|
||||
DatabaseMirror database.clamav.net
|
||||
|
||||
# How many attempts to make before giving up.
|
||||
# Default: 3 (per mirror)
|
||||
#MaxAttempts 5
|
||||
|
||||
# With this option you can control scripted updates. It's highly recommended
|
||||
# to keep it enabled.
|
||||
# Default: yes
|
||||
#ScriptedUpdates yes
|
||||
|
||||
# By default freshclam will keep the local databases (.cld) uncompressed to
|
||||
# make their handling faster. With this option you can enable the compression;
|
||||
# the change will take effect with the next database update.
|
||||
# Default: no
|
||||
#CompressLocalDatabase no
|
||||
|
||||
# With this option you can provide custom sources (http:// or file://) for
|
||||
# database files. This option can be used multiple times.
|
||||
# Default: no custom URLs
|
||||
#DatabaseCustomURL http://myserver.com/mysigs.ndb
|
||||
#DatabaseCustomURL file:///mnt/nfs/local.hdb
|
||||
|
||||
# This option allows you to easily point freshclam to private mirrors.
|
||||
# If PrivateMirror is set, freshclam does not attempt to use DNS
|
||||
# to determine whether its databases are out-of-date, instead it will
|
||||
# use the If-Modified-Since request or directly check the headers of the
|
||||
# remote database files. For each database, freshclam first attempts
|
||||
# to download the CLD file. If that fails, it tries to download the
|
||||
# CVD file. This option overrides DatabaseMirror, DNSDatabaseInfo
|
||||
# and ScriptedUpdates. It can be used multiple times to provide
|
||||
# fall-back mirrors.
|
||||
# Default: disabled
|
||||
#PrivateMirror mirror1.mynetwork.com
|
||||
#PrivateMirror mirror2.mynetwork.com
|
||||
# Update settings
|
||||
ScriptedUpdates yes
|
||||
|
||||
# Number of database checks per day.
|
||||
# Default: 12 (every two hours)
|
||||
#Checks 24
|
||||
Checks 12
|
||||
|
||||
# Proxy settings
|
||||
# Default: disabled
|
||||
#HTTPProxyServer myproxy.com
|
||||
#HTTPProxyPort 1234
|
||||
#HTTPProxyUsername myusername
|
||||
#HTTPProxyPassword mypass
|
||||
|
||||
# If your servers are behind a firewall/proxy which applies User-Agent
|
||||
# filtering you can use this option to force the use of a different
|
||||
# User-Agent header.
|
||||
# Default: clamav/version_number
|
||||
#HTTPUserAgent SomeUserAgentIdString
|
||||
|
||||
# Use aaa.bbb.ccc.ddd as client address for downloading databases. Useful for
|
||||
# multi-homed systems.
|
||||
# Default: Use OS'es default outgoing IP address.
|
||||
#LocalIPAddress aaa.bbb.ccc.ddd
|
||||
|
||||
# Send the RELOAD command to clamd.
|
||||
# Default: no
|
||||
#NotifyClamd /path/to/clamd.conf
|
||||
|
||||
# Run command after successful database update.
|
||||
# Default: disabled
|
||||
#OnUpdateExecute command
|
||||
|
||||
# Run command when database update process fails.
|
||||
# Default: disabled
|
||||
#OnErrorExecute command
|
||||
|
||||
# Run command when freshclam reports outdated version.
|
||||
# In the command string %v will be replaced by the new version number.
|
||||
# Default: disabled
|
||||
#OnOutdatedExecute command
|
||||
|
||||
# Don't fork into background.
|
||||
# Default: no
|
||||
# Don't fork (good for containers)
|
||||
Foreground no
|
||||
|
||||
# Enable debug messages in libclamav.
|
||||
# Default: no
|
||||
#Debug yes
|
||||
# Connection timeouts
|
||||
ConnectTimeout 60
|
||||
ReceiveTimeout 60
|
||||
|
||||
# Timeout in seconds when connecting to database server.
|
||||
# Default: 30
|
||||
#ConnectTimeout 60
|
||||
# Test databases before using them
|
||||
TestDatabases yes
|
||||
|
||||
# Timeout in seconds when reading from database server.
|
||||
# Default: 30
|
||||
#ReceiveTimeout 60
|
||||
|
||||
# With this option enabled, freshclam will attempt to load new
|
||||
# databases into memory to make sure they are properly handled
|
||||
# by libclamav before replacing the old ones.
|
||||
# Default: yes
|
||||
#TestDatabases yes
|
||||
|
||||
# When enabled freshclam will submit statistics to the ClamAV Project about
|
||||
# the latest virus detections in your environment. The ClamAV maintainers
|
||||
# will then use this data to determine what types of malware are the most
|
||||
# detected in the field and in what geographic area they are.
|
||||
# Freshclam will connect to clamd in order to get recent statistics.
|
||||
# Default: no
|
||||
#SubmitDetectionStats /path/to/clamd.conf
|
||||
|
||||
# Country of origin of malware/detection statistics (for statistical
|
||||
# purposes only). The statistics collector at ClamAV.net will look up
|
||||
# your IP address to determine the geographical origin of the malware
|
||||
# reported by your installation. If this installation is mainly used to
|
||||
# scan data which comes from a different location, please enable this
|
||||
# option and enter a two-letter code (see http://www.iana.org/domains/root/db/)
|
||||
# of the country of origin.
|
||||
# Default: disabled
|
||||
#DetectionStatsCountry country-code
|
||||
|
||||
# This option enables support for our "Personal Statistics" service.
|
||||
# When this option is enabled, the information on malware detected by
|
||||
# your clamd installation is made available to you through our website.
|
||||
# To get your HostID, log on http://www.stats.clamav.net and add a new
|
||||
# host to your host list. Once you have the HostID, uncomment this option
|
||||
# and paste the HostID here. As soon as your freshclam starts submitting
|
||||
# information to our stats collecting service, you will be able to view
|
||||
# the statistics of this clamd installation by logging into
|
||||
# http://www.stats.clamav.net with the same credentials you used to
|
||||
# generate the HostID. For more information refer to:
|
||||
# http://www.clamav.net/documentation.html#cctts
|
||||
# This feature requires SubmitDetectionStats to be enabled.
|
||||
# Default: disabled
|
||||
#DetectionStatsHostID unique-id
|
||||
|
||||
# This option enables support for Google Safe Browsing. When activated for
|
||||
# the first time, freshclam will download a new database file (safebrowsing.cvd)
|
||||
# which will be automatically loaded by clamd and clamscan during the next
|
||||
# reload, provided that the heuristic phishing detection is turned on. This
|
||||
# database includes information about websites that may be phishing sites or
|
||||
# possible sources of malware. When using this option, it's mandatory to run
|
||||
# freshclam at least every 30 minutes.
|
||||
# Freshclam uses the ClamAV's mirror infrastructure to distribute the
|
||||
# database and its updates but all the contents are provided under Google's
|
||||
# terms of use. See http://www.google.com/transparencyreport/safebrowsing
|
||||
# and http://www.clamav.net/documentation.html#safebrowsing
|
||||
# for more information.
|
||||
# Default: disabled
|
||||
#SafeBrowsing yes
|
||||
|
||||
# This option enables downloading of bytecode.cvd, which includes additional
|
||||
# detection mechanisms and improvements to the ClamAV engine.
|
||||
# Default: enabled
|
||||
#Bytecode yes
|
||||
|
||||
# Download an additional 3rd party signature database distributed through
|
||||
# the ClamAV mirrors.
|
||||
# This option can be used multiple times.
|
||||
#ExtraDatabase dbname1
|
||||
#ExtraDatabase dbname2
|
||||
# Enable bytecode signatures
|
||||
Bytecode yes
|
||||
1044
package-lock.json
generated
File diff suppressed because it is too large
|
|
@ -59,7 +59,6 @@
|
|||
"hot-hook": "^0.4.0",
|
||||
"numeral": "^2.0.6",
|
||||
"pinia": "^3.0.2",
|
||||
"pino-pretty": "^13.0.0",
|
||||
"postcss-loader": "^8.1.1",
|
||||
"prettier": "^3.4.2",
|
||||
"supertest": "^6.3.3",
|
||||
|
|
@ -115,7 +114,9 @@
|
|||
"node-2fa": "^2.0.3",
|
||||
"node-exceptions": "^4.0.1",
|
||||
"notiwind": "^2.0.0",
|
||||
"p-limit": "^7.1.1",
|
||||
"pg": "^8.9.0",
|
||||
"pino-pretty": "^13.0.0",
|
||||
"qrcode": "^1.5.3",
|
||||
"redis": "^5.0.0",
|
||||
"reflect-metadata": "^0.2.1",
|
||||
|
|
|
|||
|
|
@ -6,17 +6,16 @@
|
|||
import type { ApplicationService } from '@adonisjs/core/types';
|
||||
import vine, { symbols, BaseLiteralType, Vine } from '@vinejs/vine';
|
||||
import type { FieldContext, FieldOptions } from '@vinejs/vine/types';
|
||||
// import type { MultipartFile, FileValidationOptions } from '@adonisjs/bodyparser/types';
|
||||
import type { MultipartFile } from '@adonisjs/core/bodyparser';
|
||||
import type { FileValidationOptions } from '@adonisjs/core/types/bodyparser';
|
||||
import { Request, RequestValidator } from '@adonisjs/core/http';
|
||||
import MimeType from '#models/mime_type';
|
||||
|
||||
|
||||
/**
|
||||
* Validation options accepted by the "file" rule
|
||||
*/
|
||||
export type FileRuleValidationOptions = Partial<FileValidationOptions> | ((field: FieldContext) => Partial<FileValidationOptions>);
|
||||
|
||||
/**
|
||||
* Extend VineJS
|
||||
*/
|
||||
|
|
@ -25,6 +24,7 @@ declare module '@vinejs/vine' {
|
|||
myfile(options?: FileRuleValidationOptions): VineMultipartFile;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extend HTTP request class
|
||||
*/
|
||||
|
|
@ -36,19 +36,54 @@ declare module '@adonisjs/core/http' {
|
|||
* Checks if the value is an instance of multipart file
|
||||
* from bodyparser.
|
||||
*/
|
||||
export function isBodyParserFile(file: MultipartFile | unknown): boolean {
|
||||
export function isBodyParserFile(file: MultipartFile | unknown): file is MultipartFile {
|
||||
return !!(file && typeof file === 'object' && 'isMultipartFile' in file);
|
||||
}
|
||||
export async function getEnabledExtensions() {
|
||||
const enabledExtensions = await MimeType.query().select('file_extension').where('enabled', true).exec();
|
||||
const extensions = enabledExtensions
|
||||
.map((extension) => {
|
||||
return extension.file_extension.split('|');
|
||||
})
|
||||
.flat();
|
||||
|
||||
return extensions;
|
||||
/**
|
||||
* Cache for enabled extensions to reduce database queries
|
||||
*/
|
||||
let extensionsCache: string[] | null = null;
|
||||
let cacheTimestamp = 0;
|
||||
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
/**
|
||||
* Get enabled extensions with caching
|
||||
*/
|
||||
export async function getEnabledExtensions(): Promise<string[]> {
|
||||
const now = Date.now();
|
||||
|
||||
if (extensionsCache && now - cacheTimestamp < CACHE_DURATION) {
|
||||
return extensionsCache;
|
||||
}
|
||||
|
||||
try {
|
||||
const enabledExtensions = await MimeType.query().select('file_extension').where('enabled', true).exec();
|
||||
|
||||
const extensions = enabledExtensions
|
||||
.map((extension) => extension.file_extension.split('|'))
|
||||
.flat()
|
||||
.map((ext) => ext.toLowerCase().trim())
|
||||
.filter((ext) => ext.length > 0);
|
||||
|
||||
extensionsCache = [...new Set(extensions)]; // Remove duplicates
|
||||
cacheTimestamp = now;
|
||||
|
||||
return extensionsCache;
|
||||
} catch (error) {
|
||||
console.error('Error fetching enabled extensions:', error);
|
||||
return extensionsCache || [];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear extensions cache
|
||||
*/
|
||||
export function clearExtensionsCache(): void {
|
||||
extensionsCache = null;
|
||||
cacheTimestamp = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* VineJS validation rule that validates the file to be an
|
||||
* instance of BodyParser MultipartFile class.
|
||||
|
|
@ -65,6 +100,7 @@ const isMultipartFile = vine.createRule(async (file: MultipartFile | unknown, op
|
|||
// At this point, you can use type assertion to explicitly tell TypeScript that file is of type MultipartFile
|
||||
const validatedFile = file as MultipartFile;
|
||||
const validationOptions = typeof options === 'function' ? options(field) : options;
|
||||
|
||||
/**
|
||||
* Set size when it's defined in the options and missing
|
||||
* on the file instance
|
||||
|
|
@ -72,30 +108,29 @@ const isMultipartFile = vine.createRule(async (file: MultipartFile | unknown, op
|
|||
if (validatedFile.sizeLimit === undefined && validationOptions.size) {
|
||||
validatedFile.sizeLimit = validationOptions.size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set extensions when it's defined in the options and missing
|
||||
* on the file instance
|
||||
*/
|
||||
// if (validatedFile.allowedExtensions === undefined && validationOptions.extnames) {
|
||||
// validatedFile.allowedExtensions = validationOptions.extnames;
|
||||
// }
|
||||
if (validatedFile.allowedExtensions === undefined && validationOptions.extnames !== undefined) {
|
||||
validatedFile.allowedExtensions = validationOptions.extnames; // await getEnabledExtensions();
|
||||
} else if (validatedFile.allowedExtensions === undefined && validationOptions.extnames === undefined) {
|
||||
validatedFile.allowedExtensions = await getEnabledExtensions();
|
||||
if (validatedFile.allowedExtensions === undefined) {
|
||||
if (validationOptions.extnames !== undefined) {
|
||||
validatedFile.allowedExtensions = validationOptions.extnames;
|
||||
} else {
|
||||
validatedFile.allowedExtensions = await getEnabledExtensions();
|
||||
}
|
||||
}
|
||||
/**
|
||||
* remove again later
|
||||
* Set extensions when it's defined in the options and missing
|
||||
* on the file instance
|
||||
*/
|
||||
// if (file.clientNameSizeLimit === undefined && validationOptions.clientNameSizeLimit) {
|
||||
// file.clientNameSizeLimit = validationOptions.clientNameSizeLimit;
|
||||
// }
|
||||
|
||||
/**
|
||||
* Validate file
|
||||
*/
|
||||
validatedFile.validate();
|
||||
try {
|
||||
validatedFile.validate();
|
||||
} catch (error) {
|
||||
field.report(`File validation failed: ${error.message}`, 'file.validation_error', field, validationOptions);
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
* Report errors
|
||||
*/
|
||||
|
|
@ -107,36 +142,37 @@ const isMultipartFile = vine.createRule(async (file: MultipartFile | unknown, op
|
|||
const MULTIPART_FILE: typeof symbols.SUBTYPE = symbols.SUBTYPE;
|
||||
|
||||
export class VineMultipartFile extends BaseLiteralType<MultipartFile, MultipartFile, MultipartFile> {
|
||||
|
||||
[MULTIPART_FILE]: string;
|
||||
// constructor(validationOptions?: FileRuleValidationOptions, options?: FieldOptions) {
|
||||
// super(options, [isMultipartFile(validationOptions || {})]);
|
||||
// this.validationOptions = validationOptions;
|
||||
// this.#private = true;
|
||||
// }
|
||||
|
||||
// clone(): this {
|
||||
// return new VineMultipartFile(this.validationOptions, this.cloneOptions()) as this;
|
||||
// }
|
||||
// #private;
|
||||
// constructor(validationOptions?: FileRuleValidationOptions, options?: FieldOptions, validations?: Validation<any>[]);
|
||||
// clone(): this;
|
||||
|
||||
public validationOptions;
|
||||
public validationOptions?: FileRuleValidationOptions;
|
||||
// extnames: (18) ['gpkg', 'htm', 'html', 'csv', 'txt', 'asc', 'c', 'cc', 'h', 'srt', 'tiff', 'pdf', 'png', 'zip', 'jpg', 'jpeg', 'jpe', 'xlsx']
|
||||
// size: '512mb'
|
||||
|
||||
// public constructor(validationOptions?: FileRuleValidationOptions, options?: FieldOptions, validations?: Validation<any>[]) {
|
||||
public constructor(validationOptions?: FileRuleValidationOptions, options?: FieldOptions) {
|
||||
// super(options, validations);
|
||||
super(options, [isMultipartFile(validationOptions || {})]);
|
||||
this.validationOptions = validationOptions;
|
||||
}
|
||||
|
||||
public clone(): any {
|
||||
// return new VineMultipartFile(this.validationOptions, this.cloneOptions(), this.cloneValidations());
|
||||
return new VineMultipartFile(this.validationOptions, this.cloneOptions());
|
||||
}
|
||||
|
||||
/**
|
||||
* Set maximum file size
|
||||
*/
|
||||
public maxSize(size: string | number): this {
|
||||
const newOptions = { ...this.validationOptions, size };
|
||||
return new VineMultipartFile(newOptions, this.cloneOptions()) as this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set allowed extensions
|
||||
*/
|
||||
public extensions(extnames: string[]): this {
|
||||
const newOptions = { ...this.validationOptions, extnames };
|
||||
return new VineMultipartFile(newOptions, this.cloneOptions()) as this;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
export default class VinejsProvider {
|
||||
|
|
@ -155,13 +191,8 @@ export default class VinejsProvider {
|
|||
/**
|
||||
* The container bindings have booted
|
||||
*/
|
||||
|
||||
boot(): void {
|
||||
// VineString.macro('translatedLanguage', function (this: VineString, options: Options) {
|
||||
// return this.use(translatedLanguageRule(options));
|
||||
// });
|
||||
|
||||
Vine.macro('myfile', function (this: Vine, options) {
|
||||
Vine.macro('myfile', function (this: Vine, options?: FileRuleValidationOptions) {
|
||||
return new VineMultipartFile(options);
|
||||
});
|
||||
|
||||
|
|
@ -175,6 +206,41 @@ export default class VinejsProvider {
|
|||
}
|
||||
return new RequestValidator(this.ctx).validateUsing(...args);
|
||||
});
|
||||
|
||||
// Ensure MIME validation macros are loaded
|
||||
this.loadMimeValidationMacros();
|
||||
this.loadFileScanMacros();
|
||||
this.loadFileLengthMacros();
|
||||
}
|
||||
|
||||
/**
|
||||
* Load MIME validation macros - called during boot to ensure they're available
|
||||
*/
|
||||
private async loadMimeValidationMacros(): Promise<void> {
|
||||
try {
|
||||
// Dynamically import the MIME validation rule to ensure macros are registered
|
||||
await import('#start/rules/allowed_extensions_mimetypes');
|
||||
} catch (error) {
|
||||
console.warn('Could not load MIME validation macros:', error);
|
||||
}
|
||||
}
|
||||
|
||||
private async loadFileScanMacros(): Promise<void> {
|
||||
try {
|
||||
// Dynamically import the file scan rule to ensure macros are registered
|
||||
await import('#start/rules/file_scan');
|
||||
} catch (error) {
|
||||
console.warn('Could not load file scan macros:', error);
|
||||
}
|
||||
}
|
||||
|
||||
private async loadFileLengthMacros(): Promise<void> {
|
||||
try {
|
||||
// Dynamically import the file length rule to ensure macros are registered
|
||||
await import('#start/rules/file_length');
|
||||
} catch (error) {
|
||||
console.warn('Could not load file length macros:', error);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -190,5 +256,7 @@ export default class VinejsProvider {
|
|||
/**
|
||||
* Preparing to shutdown the app
|
||||
*/
|
||||
async shutdown() {}
|
||||
async shutdown() {
|
||||
clearExtensionsCache();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
174
readme.md
|
|
@ -11,6 +11,8 @@ Welcome to the Tethys Research Repository Backend System! This is the backend co
|
|||
- [Configuration](#configuration)
|
||||
- [Database](#database)
|
||||
- [API Documentation](#api-documentation)
|
||||
- [Commands](#commands)
|
||||
- [Documentation](#documentation)
|
||||
- [Contributing](#contributing)
|
||||
- [License](#license)
|
||||
|
||||
|
|
@ -29,5 +31,175 @@ Before you begin, ensure you have met the following requirements:
|
|||
1. Clone this repository:
|
||||
|
||||
```bash
|
||||
git clone https://gitea.geologie.ac.at/geolba/tethys.backend.git
|
||||
git clone https://gitea.geologie.ac.at/geolba/tethys.backend.git
|
||||
cd tethys-backend
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
3. Configure environment variables (see [Configuration](#configuration))
|
||||
|
||||
4. Run database migrations:
|
||||
|
||||
```bash
|
||||
node ace migration:run
|
||||
```
|
||||
|
||||
5. Start the development server:
|
||||
|
||||
```bash
|
||||
npm run dev
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
The Tethys Backend provides RESTful APIs for managing research datasets, user authentication, DOI registration, and search functionality.
|
||||
|
||||
## Configuration

Copy the `.env.example` file to `.env` and configure the following variables:

### Database Configuration

```bash
DB_CONNECTION=pg
DB_HOST=localhost
DB_PORT=5432
DB_USER=your_username
DB_PASSWORD=your_password
DB_DATABASE=tethys_db
```

### DataCite Configuration

```bash
# DataCite Credentials
DATACITE_USERNAME=your_datacite_username
DATACITE_PASSWORD=your_datacite_password
DATACITE_PREFIX=10.21388

# Environment-specific API endpoints
DATACITE_API_URL=https://api.test.datacite.org      # Test environment
DATACITE_SERVICE_URL=https://mds.test.datacite.org  # Test MDS

# For production:
# DATACITE_API_URL=https://api.datacite.org
# DATACITE_SERVICE_URL=https://mds.datacite.org
```

### OpenSearch Configuration

```bash
OPENSEARCH_HOST=localhost:9200
```

### Application Configuration

```bash
BASE_DOMAIN=tethys.at
APP_KEY=your_app_key
```

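How these variables are read is not shown here, but in an AdonisJS 6 project they are typically declared and validated once at boot in `start/env.ts`. A minimal sketch, assuming exactly the variable names listed above (the project's real file may declare more):

```ts
// start/env.ts (sketch) — validates required environment variables at boot.
import { Env } from '@adonisjs/core/env';

export default await Env.create(new URL('../', import.meta.url), {
  APP_KEY: Env.schema.string(),
  BASE_DOMAIN: Env.schema.string(),

  DB_CONNECTION: Env.schema.string(),
  DB_HOST: Env.schema.string({ format: 'host' }),
  DB_PORT: Env.schema.number(),
  DB_USER: Env.schema.string(),
  DB_PASSWORD: Env.schema.string.optional(),
  DB_DATABASE: Env.schema.string(),

  DATACITE_USERNAME: Env.schema.string(),
  DATACITE_PASSWORD: Env.schema.string(),
  DATACITE_PREFIX: Env.schema.string(),
  DATACITE_API_URL: Env.schema.string(),
  DATACITE_SERVICE_URL: Env.schema.string(),

  OPENSEARCH_HOST: Env.schema.string(),
});
```
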
## Database

The system uses PostgreSQL with Lucid ORM. Key models include:

- **Dataset**: Research dataset metadata
- **DatasetIdentifier**: DOI and other identifiers for datasets
- **User**: User management and authentication
- **XmlCache**: Cached XML metadata

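For orientation, a minimal sketch of how the `Dataset`/`DatasetIdentifier` relationship could be expressed with Lucid; the column names below are illustrative, not the project's actual schema:

```ts
// Hypothetical Lucid sketch; the real models live in the app and carry many more columns.
import { BaseModel, column, hasMany } from '@adonisjs/lucid/orm';
import type { HasMany } from '@adonisjs/lucid/types/relations';

export class DatasetIdentifier extends BaseModel {
  @column({ isPrimary: true })
  declare id: number;

  @column()
  declare datasetId: number;

  @column()
  declare value: string; // e.g. the DOI string

  @column()
  declare type: string; // e.g. 'doi'
}

export class Dataset extends BaseModel {
  @column({ isPrimary: true })
  declare id: number;

  @column()
  declare publishId: number | null;

  // One dataset can carry several identifiers (DOI, handle, ...).
  @hasMany(() => DatasetIdentifier)
  declare identifiers: HasMany<typeof DatasetIdentifier>;
}
```
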
Run migrations and seeders:

```bash
# Run migrations
node ace migration:run

# Run seeders (if available)
node ace db:seed
```

## API Documentation

API endpoints are available for:

- Dataset management (`/api/datasets`)
- User authentication (`/api/auth`)
- DOI registration (`/api/doi`)
- Search functionality (`/api/search`)

*Detailed API documentation can be found in the `/docs/api` directory.*

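As a quick smoke test, a single dataset can be fetched by its publish id via the route defined in the API routes file (shown later in this diff). The host and response shape below are illustrative only:

```ts
// Illustrative client call; adjust the host to your deployment.
const response = await fetch('https://tethys.at/api/dataset/123', {
  headers: { Accept: 'application/json' },
});

if (!response.ok) {
  throw new Error(`Request failed with status ${response.status}`);
}

const dataset = await response.json();
console.log(dataset);
```
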
## Commands

The system includes several Ace commands for maintenance and data management:

### Dataset Indexing

```bash
# Index all published datasets to OpenSearch
node ace index:datasets

# Index a specific dataset
node ace index:datasets --publish_id 123
```

### DataCite DOI Management

```bash
# Update DataCite records for modified datasets
node ace update:datacite

# Show detailed statistics for datasets needing updates
node ace update:datacite --stats

# Preview what would be updated (dry run)
node ace update:datacite --dry-run

# Force update all DOI records
node ace update:datacite --force

# Update a specific dataset
node ace update:datacite --publish_id 123
```

*For detailed command documentation, see the [Commands Documentation](docs/commands/)*

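Commands like `update:datacite` follow the standard AdonisJS 6 Ace layout. A hypothetical skeleton is sketched below; the flag names mirror the usage above, but the project's real implementation is not reproduced here:

```ts
// commands/update_datacite.ts (sketch only, not the actual implementation)
import { BaseCommand, flags } from '@adonisjs/core/ace';
import type { CommandOptions } from '@adonisjs/core/types/ace';

export default class UpdateDatacite extends BaseCommand {
  static commandName = 'update:datacite';
  static description = 'Synchronise DataCite DOI records for modified datasets';
  static options: CommandOptions = { startApp: true };

  @flags.boolean({ description: 'Preview changes without writing to DataCite' })
  declare dryRun: boolean;

  @flags.boolean({ description: 'Update every DOI record, not only modified ones' })
  declare force: boolean;

  @flags.number({ description: 'Restrict the run to a single dataset (exposed as --publish_id)' })
  declare publishId?: number;

  async run() {
    this.logger.info(`dry-run=${this.dryRun ?? false}, force=${this.force ?? false}`);
    // ... load datasets, compare metadata timestamps, push XML to DataCite
  }
}
```
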
## Documentation

Comprehensive documentation is available in the `/docs` directory:

- **[Commands Documentation](docs/commands/)** - Detailed guides for Ace commands
  - [DataCite Update Command](docs/commands/update-datacite.md) - DOI synchronization and management
  - [Dataset Indexing Command](docs/commands/index-datasets.md) - Search index management
- **[API Documentation](docs/api/)** - REST API endpoints and usage
- **[Deployment Guide](docs/deployment/)** - Production deployment instructions
- **[Configuration Guide](docs/configuration/)** - Environment setup and configuration options

## Contributing

1. Fork the repository
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
3. Commit your changes (`git commit -m 'Add some amazing feature'`)
4. Push to the branch (`git push origin feature/amazing-feature`)
5. Open a Pull Request

### Development Guidelines

- Follow the existing code style and conventions
- Write tests for new features
- Update documentation for any API changes
- Ensure all commands and migrations work properly

### Testing Commands

```bash
# Run tests
npm test

# Test specific commands
node ace update:datacite --dry-run --publish_id 123
node ace index:datasets --publish_id 123
```

## License

This project is licensed under the [MIT License](LICENSE).

@ -163,7 +163,7 @@
    </div>
</FormControl>
</FormField>
<FormField label="Main Title Language*" help="required: main abstract language"
<FormField label="Main Description Language*" help="required: main abstract language"
    :class="{ 'text-red-400': form.errors['descriptions.0.language'] }"
    class="w-full ml-1 flex-1">
    <FormControl required v-model="form.descriptions[0].language" type="text"

@ -725,7 +725,7 @@ Removes a selected keyword
    </div>
</FormControl>
</FormField>
<FormField label="Main Title Language*" help="required: main abstract language"
<FormField label="Main Description Language*" help="required: main abstract language"
    :class="{ 'text-red-400': form.errors['descriptions.0.language'] }"
    class="w-full mx-2 flex-1">
    <FormControl required v-model="form.descriptions[0].language" type="text"

@ -272,7 +272,7 @@
</FormControl>
</FormField>
<FormField
    label="Main Title Language*"
    label="Main Description Language*"
    help="required: main abstract language"
    :class="{ 'text-red-400': form.errors['descriptions.0.language'] }"
    class="w-full ml-1 flex-1"

@ -8,14 +8,24 @@ import AvatarController from '#controllers/Http/Api/AvatarController';
import UserController from '#controllers/Http/Api/UserController';
import CollectionsController from '#controllers/Http/Api/collections_controller';
import { middleware } from '../kernel.js';
// API

// Clean DOI URL routes (no /api prefix)

// API routes with /api prefix
router
  .group(() => {
    router.get('clients', [UserController, 'getSubmitters']).as('client.index').use(middleware.auth());;
    router.get('authors', [AuthorsController, 'index']).as('author.index').use(middleware.auth());;
    router.get('clients', [UserController, 'getSubmitters']).as('client.index').use(middleware.auth());
    router.get('authors', [AuthorsController, 'index']).as('author.index').use(middleware.auth());
    router.get('datasets', [DatasetController, 'index']).as('dataset.index');
    router.get('persons', [AuthorsController, 'persons']).as('author.persons');

    // This should come BEFORE any other routes that might conflict
    router
      .get('/dataset/:prefix/:value', [DatasetController, 'findByIdentifier'])
      .where('prefix', /^10\.\d+$/) // Match DOI prefix pattern (10.xxxx)
      .where('value', /^[a-zA-Z0-9._-]+\.[0-9]+(?:\.[0-9]+)*$/) // Match DOI suffix pattern
      .as('dataset.findByIdentifier');

    router.get('/dataset', [DatasetController, 'findAll']).as('dataset.findAll');
    router.get('/dataset/:publish_id', [DatasetController, 'findOne']).as('dataset.findOne');
    router.get('/sitelinks/:year', [HomeController, 'findDocumentsPerYear']);

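For orientation, the two `where` patterns restrict the identifier route to DOI-shaped values, so plain publish ids still reach the `findOne` route. A quick, illustrative check (the suffix value is invented; `10.21388` is the DataCite prefix shown in the readme configuration):

```ts
// Illustrative only — mirrors the patterns used in the route above.
const prefixPattern = /^10\.\d+$/;
const suffixPattern = /^[a-zA-Z0-9._-]+\.[0-9]+(?:\.[0-9]+)*$/;

console.log(prefixPattern.test('10.21388'));     // true  -> /api/dataset/10.21388/... hits findByIdentifier
console.log(suffixPattern.test('tethys.123.1')); // true
console.log(prefixPattern.test('123'));          // false -> /api/dataset/123 falls through to findOne
```
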
@ -35,7 +45,7 @@ router
    .as('apps.twofactor_backupcodes.create')
    .use(middleware.auth());

    router.get('collections/:id', [CollectionsController, 'show']).as('collection.show')
    router.get('collections/:id', [CollectionsController, 'show']).as('collection.show');
  })
  // .namespace('App/Controllers/Http/Api')
  .prefix('api');

@ -1,7 +1,7 @@
/*
|--------------------------------------------------------------------------
| Preloaded File - node ace make:preload rules/orcid
| ❯ Do you want to register the preload file in .adonisrc.ts file? (y/N) · true
| Do you want to register the preload file in .adonisrc.ts file? (y/N) · true
| DONE: create start/rules/orcid.ts
| DONE: update adonisrc.ts file
|--------------------------------------------------------------------------