- feat: Enhance README with setup instructions, usage, and command documentation
- fix: Update API routes to include DOI URL handling and improve route organization - chore: Add ORCID preload rule file and ensure proper registration - docs: Add MIT License to the project for open-source compliance - feat: Implement command to detect and fix missing dataset cross-references - feat: Create command for updating DataCite DOI records with detailed logging and error handling - docs: Add comprehensive documentation for dataset indexing command - docs: Create detailed documentation for DataCite update command with usage examples and error handling
This commit is contained in:
parent
8f67839f93
commit
c049b22723
11 changed files with 2187 additions and 555 deletions
22
LICENSE
Normal file
22
LICENSE
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2025 Tethys Research Repository
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
|
|
@ -1,23 +1,35 @@
|
||||||
import type { HttpContext } from '@adonisjs/core/http';
|
import type { HttpContext } from '@adonisjs/core/http';
|
||||||
// import Person from 'App/Models/Person';
|
|
||||||
import Dataset from '#models/dataset';
|
import Dataset from '#models/dataset';
|
||||||
import { StatusCodes } from 'http-status-codes';
|
import { StatusCodes } from 'http-status-codes';
|
||||||
|
|
||||||
// node ace make:controller Author
|
// node ace make:controller Author
|
||||||
export default class DatasetController {
|
export default class DatasetController {
|
||||||
public async index({}: HttpContext) {
|
/**
|
||||||
// Select datasets with server_state 'published' or 'deleted' and sort by the last published date
|
* GET /api/datasets
|
||||||
const datasets = await Dataset.query()
|
* Find all published datasets
|
||||||
.where(function (query) {
|
*/
|
||||||
query.where('server_state', 'published').orWhere('server_state', 'deleted');
|
public async index({ response }: HttpContext) {
|
||||||
})
|
try {
|
||||||
.preload('titles')
|
const datasets = await Dataset.query()
|
||||||
.preload('identifier')
|
.where(function (query) {
|
||||||
.orderBy('server_date_published', 'desc');
|
query.where('server_state', 'published').orWhere('server_state', 'deleted');
|
||||||
|
})
|
||||||
|
.preload('titles')
|
||||||
|
.preload('identifier')
|
||||||
|
.orderBy('server_date_published', 'desc');
|
||||||
|
|
||||||
return datasets;
|
return response.status(StatusCodes.OK).json(datasets);
|
||||||
|
} catch (error) {
|
||||||
|
return response.status(StatusCodes.INTERNAL_SERVER_ERROR).json({
|
||||||
|
message: error.message || 'Some error occurred while retrieving datasets.',
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/dataset
|
||||||
|
* Find all published datasets
|
||||||
|
*/
|
||||||
public async findAll({ response }: HttpContext) {
|
public async findAll({ response }: HttpContext) {
|
||||||
try {
|
try {
|
||||||
const datasets = await Dataset.query()
|
const datasets = await Dataset.query()
|
||||||
|
|
@ -33,48 +45,142 @@ export default class DatasetController {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public async findOne({ params }: HttpContext) {
|
/**
|
||||||
const datasets = await Dataset.query()
|
* GET /api/dataset/:publish_id
|
||||||
.where('publish_id', params.publish_id)
|
* Find one dataset by publish_id
|
||||||
.preload('titles')
|
*/
|
||||||
.preload('descriptions')
|
public async findOne({ response, params }: HttpContext) {
|
||||||
.preload('user', (builder) => {
|
try {
|
||||||
builder.select(['id', 'firstName', 'lastName', 'avatar', 'login']);
|
const dataset = await Dataset.query()
|
||||||
})
|
.where('publish_id', params.publish_id)
|
||||||
.preload('authors', (builder) => {
|
.preload('titles')
|
||||||
builder
|
.preload('descriptions') // Using 'descriptions' instead of 'abstracts'
|
||||||
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
|
.preload('user', (builder) => {
|
||||||
.withCount('datasets', (query) => {
|
builder.select(['id', 'firstName', 'lastName', 'avatar', 'login']);
|
||||||
query.as('datasets_count');
|
})
|
||||||
})
|
.preload('authors', (builder) => {
|
||||||
.pivotColumns(['role', 'sort_order'])
|
builder
|
||||||
.orderBy('pivot_sort_order', 'asc');
|
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
|
||||||
})
|
.withCount('datasets', (query) => {
|
||||||
.preload('contributors', (builder) => {
|
query.as('datasets_count');
|
||||||
builder
|
})
|
||||||
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
|
.pivotColumns(['role', 'sort_order'])
|
||||||
.withCount('datasets', (query) => {
|
.orderBy('pivot_sort_order', 'asc');
|
||||||
query.as('datasets_count');
|
})
|
||||||
})
|
.preload('contributors', (builder) => {
|
||||||
.pivotColumns(['role', 'sort_order', 'contributor_type'])
|
builder
|
||||||
.orderBy('pivot_sort_order', 'asc');
|
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
|
||||||
})
|
.withCount('datasets', (query) => {
|
||||||
.preload('subjects')
|
query.as('datasets_count');
|
||||||
.preload('coverage')
|
})
|
||||||
.preload('licenses')
|
.pivotColumns(['role', 'sort_order', 'contributor_type'])
|
||||||
.preload('references')
|
.orderBy('pivot_sort_order', 'asc');
|
||||||
.preload('project')
|
})
|
||||||
.preload('referenced_by', (builder) => {
|
.preload('subjects')
|
||||||
builder.preload('dataset', (builder) => {
|
.preload('coverage')
|
||||||
builder.preload('identifier');
|
.preload('licenses')
|
||||||
});
|
.preload('references')
|
||||||
})
|
.preload('project')
|
||||||
.preload('files', (builder) => {
|
.preload('referenced_by', (builder) => {
|
||||||
builder.preload('hashvalues');
|
builder.preload('dataset', (builder) => {
|
||||||
})
|
builder.preload('identifier');
|
||||||
.preload('identifier')
|
});
|
||||||
.firstOrFail();
|
})
|
||||||
|
.preload('files', (builder) => {
|
||||||
|
builder.preload('hashvalues');
|
||||||
|
})
|
||||||
|
.preload('identifier')
|
||||||
|
.first(); // Use first() instead of firstOrFail() to handle not found gracefully
|
||||||
|
|
||||||
return datasets;
|
if (!dataset) {
|
||||||
|
return response.status(StatusCodes.NOT_FOUND).json({
|
||||||
|
message: `Cannot find Dataset with publish_id=${params.publish_id}.`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return response.status(StatusCodes.OK).json(dataset);
|
||||||
|
} catch (error) {
|
||||||
|
return response.status(StatusCodes.INTERNAL_SERVER_ERROR).json({
|
||||||
|
message: error.message || `Error retrieving Dataset with publish_id=${params.publish_id}.`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /:prefix/:value
|
||||||
|
* Find dataset by identifier (e.g., https://doi.tethys.at/10.24341/tethys.99.2)
|
||||||
|
*/
|
||||||
|
public async findByIdentifier({ response, params }: HttpContext) {
|
||||||
|
const identifierValue = `${params.prefix}/${params.value}`;
|
||||||
|
|
||||||
|
// Optional: Validate DOI format
|
||||||
|
if (!identifierValue.match(/^10\.\d+\/[a-zA-Z0-9._-]+\.[0-9]+(?:\.[0-9]+)*$/)) {
|
||||||
|
return response.status(StatusCodes.BAD_REQUEST).json({
|
||||||
|
message: `Invalid DOI format: ${identifierValue}`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Method 1: Using subquery with whereIn (most similar to your original)
|
||||||
|
const dataset = await Dataset.query()
|
||||||
|
// .whereIn('id', (subQuery) => {
|
||||||
|
// subQuery.select('dataset_id').from('dataset_identifiers').where('value', identifierValue);
|
||||||
|
// })
|
||||||
|
.whereHas('identifier', (builder) => {
|
||||||
|
builder.where('value', identifierValue);
|
||||||
|
})
|
||||||
|
.preload('titles')
|
||||||
|
.preload('descriptions') // Using 'descriptions' instead of 'abstracts'
|
||||||
|
.preload('user', (builder) => {
|
||||||
|
builder.select(['id', 'firstName', 'lastName', 'avatar', 'login']);
|
||||||
|
})
|
||||||
|
.preload('authors', (builder) => {
|
||||||
|
builder
|
||||||
|
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
|
||||||
|
.withCount('datasets', (query) => {
|
||||||
|
query.as('datasets_count');
|
||||||
|
})
|
||||||
|
.pivotColumns(['role', 'sort_order'])
|
||||||
|
.wherePivot('role', 'author')
|
||||||
|
.orderBy('pivot_sort_order', 'asc');
|
||||||
|
})
|
||||||
|
.preload('contributors', (builder) => {
|
||||||
|
builder
|
||||||
|
.select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
|
||||||
|
.withCount('datasets', (query) => {
|
||||||
|
query.as('datasets_count');
|
||||||
|
})
|
||||||
|
.pivotColumns(['role', 'sort_order', 'contributor_type'])
|
||||||
|
.wherePivot('role', 'contributor')
|
||||||
|
.orderBy('pivot_sort_order', 'asc');
|
||||||
|
})
|
||||||
|
.preload('subjects')
|
||||||
|
.preload('coverage')
|
||||||
|
.preload('licenses')
|
||||||
|
.preload('references')
|
||||||
|
.preload('project')
|
||||||
|
.preload('referenced_by', (builder) => {
|
||||||
|
builder.preload('dataset', (builder) => {
|
||||||
|
builder.preload('identifier');
|
||||||
|
});
|
||||||
|
})
|
||||||
|
.preload('files', (builder) => {
|
||||||
|
builder.preload('hashvalues');
|
||||||
|
})
|
||||||
|
.preload('identifier')
|
||||||
|
.first();
|
||||||
|
|
||||||
|
if (!dataset) {
|
||||||
|
return response.status(StatusCodes.NOT_FOUND).json({
|
||||||
|
message: `Cannot find Dataset with identifier=${identifierValue}.`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return response.status(StatusCodes.OK).json(dataset);
|
||||||
|
} catch (error) {
|
||||||
|
return response.status(StatusCodes.INTERNAL_SERVER_ERROR).json({
|
||||||
|
message: error.message || `Error retrieving Dataset with identifier=${identifierValue}.`,
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,3 @@
|
||||||
// import { Client } from 'guzzle';
|
|
||||||
// import { Log } from '@adonisjs/core/build/standalone';
|
|
||||||
// import { DoiInterface } from './interfaces/DoiInterface';
|
|
||||||
import DoiClientContract from '#app/Library/Doi/DoiClientContract';
|
import DoiClientContract from '#app/Library/Doi/DoiClientContract';
|
||||||
import DoiClientException from '#app/exceptions/DoiClientException';
|
import DoiClientException from '#app/exceptions/DoiClientException';
|
||||||
import { StatusCodes } from 'http-status-codes';
|
import { StatusCodes } from 'http-status-codes';
|
||||||
|
|
@ -12,14 +9,14 @@ export class DoiClient implements DoiClientContract {
|
||||||
public username: string;
|
public username: string;
|
||||||
public password: string;
|
public password: string;
|
||||||
public serviceUrl: string;
|
public serviceUrl: string;
|
||||||
|
public apiUrl: string;
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
// const datacite_environment = process.env.DATACITE_ENVIRONMENT || 'debug';
|
// const datacite_environment = process.env.DATACITE_ENVIRONMENT || 'debug';
|
||||||
this.username = process.env.DATACITE_USERNAME || '';
|
this.username = process.env.DATACITE_USERNAME || '';
|
||||||
this.password = process.env.DATACITE_PASSWORD || '';
|
this.password = process.env.DATACITE_PASSWORD || '';
|
||||||
this.serviceUrl = process.env.DATACITE_SERVICE_URL || '';
|
this.serviceUrl = process.env.DATACITE_SERVICE_URL || '';
|
||||||
// this.prefix = process.env.DATACITE_PREFIX || '';
|
this.apiUrl = process.env.DATACITE_API_URL || 'https://api.datacite.org';
|
||||||
// this.base_domain = process.env.BASE_DOMAIN || '';
|
|
||||||
|
|
||||||
if (this.username === '' || this.password === '' || this.serviceUrl === '') {
|
if (this.username === '' || this.password === '' || this.serviceUrl === '') {
|
||||||
const message = 'issing configuration settings to properly initialize DOI client';
|
const message = 'issing configuration settings to properly initialize DOI client';
|
||||||
|
|
@ -90,4 +87,240 @@ export class DoiClient implements DoiClientContract {
|
||||||
throw new DoiClientException(error.response.status, error.response.data);
|
throw new DoiClientException(error.response.status, error.response.data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Looks up a DOI through the DataCite REST API (`{apiUrl}/dois/{doi}`).
 *
 * A 404 from the REST API is reported as "not found" (`null`). Any other
 * failure is logged at debug level and the lookup is retried through the
 * MDS API via `getDoiInfoFromMds`.
 *
 * @param doiValue The DOI identifier e.g. '10.5072/tethys.999'
 * @returns The selected DOI attributes plus the full attribute set under
 *          `metadata`, or null when the DOI could not be resolved
 */
public async getDoiInfo(doiValue: string): Promise<any | null> {
    try {
        const endpoint = `${this.apiUrl}/dois/${doiValue}`;
        const requestConfig = {
            headers: {
                Accept: 'application/vnd.api+json',
            },
        };
        const restResponse = await axios.get(endpoint, requestConfig);

        // Anything other than a 200 carrying a JSON:API "data" member is treated as "no information".
        if (restResponse.status !== 200 || !restResponse.data.data) {
            return null;
        }

        const attributes = restResponse.data.data.attributes;
        const { created, registered, updated, published, state, url } = attributes;
        return { created, registered, updated, published, state, url, metadata: attributes };
    } catch (error) {
        const notFound = error.response?.status === 404;
        if (notFound) {
            logger.debug(`DOI ${doiValue} not found in DataCite`);
            return null;
        }

        logger.debug(`DataCite REST API failed for ${doiValue}: ${error.message}`);

        // The REST API was unusable for a reason other than 404: try the MDS API instead.
        return await this.getDoiInfoFromMds(doiValue);
    }
}
|
||||||
|
|
||||||
|
/**
 * Fallback lookup of a DOI through the MDS API (`{serviceUrl}/doi/{doi}`).
 *
 * The DOI's registered URL is fetched first; the XML metadata is fetched
 * afterwards on a best-effort basis and simply omitted when that second
 * request fails.
 *
 * NOTE(review): `created` and `registered` are filled with the current time
 * because this code path has no real dates available. Callers that compare
 * these values (e.g. `getDoiLastModified`) will see "now" for MDS-sourced
 * results; confirm that is acceptable.
 *
 * @param doiValue The DOI identifier
 * @returns Basic DOI information tagged with `source: 'mds'`, or null when
 *          the DOI is unknown or the request failed
 */
private async getDoiInfoFromMds(doiValue: string): Promise<any | null> {
    try {
        const auth = { username: this.username, password: this.password };

        const doiResponse = await axios.get(`${this.serviceUrl}/doi/${doiValue}`, { auth });
        if (doiResponse.status !== 200) {
            return null;
        }

        // Evaluated lazily so that it runs at the same point as in each return path.
        const landingUrl = () => doiResponse.data.trim();
        const nowIso = () => new Date().toISOString();

        try {
            const metadataResponse = await axios.get(`${this.serviceUrl}/metadata/${doiValue}`, {
                auth,
                headers: {
                    Accept: 'application/xml',
                },
            });

            return {
                url: landingUrl(),
                metadata: metadataResponse.data,
                created: nowIso(), // synthetic value, no creation date available here
                registered: nowIso(), // synthetic value, see note above
                source: 'mds',
            };
        } catch (metadataError) {
            // The DOI itself resolved, so still report the basics without metadata.
            return {
                url: landingUrl(),
                created: nowIso(),
                registered: nowIso(),
                source: 'mds',
            };
        }
    } catch (error) {
        if (error.response?.status === 404) {
            logger.debug(`DOI ${doiValue} not found in DataCite MDS`);
            return null;
        }

        logger.debug(`DataCite MDS API failed for ${doiValue}: ${error.message}`);
        return null;
    }
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `getDoiInfo` can resolve the given DOI.
 *
 * Because `getDoiInfo` returns null for failed lookups as well as for
 * unknown DOIs, `false` here means "could not be resolved".
 *
 * @param doiValue The DOI identifier
 * @returns Promise<boolean> True if DOI information was returned
 */
public async doiExists(doiValue: string): Promise<boolean> {
    return (await this.getDoiInfo(doiValue)) !== null;
}
|
||||||
|
|
||||||
|
/**
 * Determines when a DOI record was last changed.
 *
 * The first non-empty value among `updated`, `registered` and `created`
 * (in that order of preference) is used.
 *
 * @param doiValue The DOI identifier
 * @returns Promise<Date | null> The chosen date, or null when the DOI could
 *          not be resolved or carries none of the three dates
 */
public async getDoiLastModified(doiValue: string): Promise<Date | null> {
    const doiInfo = await this.getDoiInfo(doiValue);
    if (!doiInfo) {
        return null;
    }

    // Ordered by preference: the most specific "last change" indicator comes first.
    for (const field of ['updated', 'registered', 'created']) {
        const candidate = doiInfo[field];
        if (candidate) {
            logger.debug(`DOI ${doiValue}: Using ${field} date: ${candidate}`);
            return new Date(candidate);
        }
    }

    return null;
}
|
||||||
|
|
||||||
|
/**
 * Makes a DOI unfindable by issuing `DELETE {serviceUrl}/doi/{doi}`.
 * Note: DOIs cannot be deleted, only made unfindable
 * await doiClient.makeDoiUnfindable('10.21388/tethys.231');
 *
 * NOTE(review): DataCite's MDS documentation describes
 * `DELETE /metadata/{doi}` as the call that deactivates a DOI and
 * `DELETE /doi/{doi}` as applying to draft DOIs; confirm this endpoint
 * has the intended effect on already findable DOIs.
 *
 * @param doiValue The DOI identifier e.g. '10.5072/tethys.999'
 * @returns Promise<AxiosResponse<any>> The http response
 * @throws DoiClientException with status 404 when the DOI cannot be
 *         resolved, with the response status when it is not 200, or with
 *         the upstream status (500 if none) for any other failure
 */
public async makeDoiUnfindable(doiValue: string): Promise<AxiosResponse<any>> {
    const auth = { username: this.username, password: this.password };

    try {
        // Refuse to continue when the DOI cannot be resolved at all.
        if (!(await this.doiExists(doiValue))) {
            throw new DoiClientException(404, `DOI ${doiValue} not found`);
        }

        const mdsResponse = await axios.delete(`${this.serviceUrl}/doi/${doiValue}`, { auth });

        // Response codes for DELETE /doi/{doi}:
        //   200 OK: operation successful
        //   401 Unauthorized: no login
        //   403 Forbidden: login problem, quota exceeded
        //   404 Not Found: DOI does not exist
        if (mdsResponse.status === 200) {
            logger.info(`DOI ${doiValue} successfully made unfindable`);
            return mdsResponse;
        }

        const message = `Unexpected DataCite MDS response code ${mdsResponse.status}`;
        logger.error(message);
        throw new DoiClientException(mdsResponse.status, message);
    } catch (error) {
        logger.error(`Failed to make DOI ${doiValue} unfindable: ${error.message}`);

        // Our own exceptions already carry the right status; wrap everything else.
        const alreadyWrapped = error instanceof DoiClientException;
        if (!alreadyWrapped) {
            throw new DoiClientException(error.response?.status || 500, error.response?.data || error.message);
        }
        throw error;
    }
}
|
||||||
|
|
||||||
|
/**
 * Makes a DOI findable again by re-registering its landing page URL via
 * `PUT {serviceUrl}/doi/{doi}`.
 * await doiClient.makeDoiFindable(
 *     '10.21388/tethys.231',
 *     'https://doi.dev.tethys.at/10.21388/tethys.231'
 * );
 *
 * The request body is the two-line plain-text form `doi=...\nurl=...` and is
 * sent with an explicit `Content-Type: text/plain;charset=UTF-8` header, as
 * the MDS API specifies for this endpoint.
 *
 * @param doiValue The DOI identifier e.g. '10.5072/tethys.999'
 * @param landingPageUrl The landing page URL
 * @returns Promise<AxiosResponse<any>> The http response
 * @throws DoiClientException with the response status when it is not 201,
 *         or with the upstream status (500 if none) for any other failure
 */
public async makeDoiFindable(doiValue: string, landingPageUrl: string): Promise<AxiosResponse<any>> {
    const auth = {
        username: this.username,
        password: this.password,
    };
    // Without this header the plain-text body is not labelled as text/plain.
    const headers = {
        'Content-Type': 'text/plain;charset=UTF-8',
    };

    try {
        // Re-register the DOI with its URL to make it findable again
        const response = await axios.put(`${this.serviceUrl}/doi/${doiValue}`, `doi=${doiValue}\nurl=${landingPageUrl}`, {
            auth,
            headers,
        });

        // Response Codes for PUT /doi/{doi}
        // 201 Created: operation successful
        // 400 Bad Request: request body must be exactly two lines: DOI and URL
        // 401 Unauthorized: no login
        // 403 Forbidden: login problem, quota exceeded
        // 412 Precondition failed: metadata must be uploaded first
        if (response.status !== 201) {
            const message = `Unexpected DataCite MDS response code ${response.status}`;
            logger.error(message);
            throw new DoiClientException(response.status, message);
        }

        logger.info(`DOI ${doiValue} successfully made findable again`);
        return response;
    } catch (error) {
        logger.error(`Failed to make DOI ${doiValue} findable: ${error.message}`);
        // Exceptions raised above already carry the right status; wrap everything else.
        if (error instanceof DoiClientException) {
            throw error;
        }
        throw new DoiClientException(error.response?.status || 500, error.response?.data || error.message);
    }
}
|
||||||
|
|
||||||
|
/**
 * Returns the `state` attribute reported for a DOI (draft, registered, findable).
 * const state = await doiClient.getDoiState('10.21388/tethys.231');
 * console.log(`Current state: ${state}`); // 'findable'
 *
 * Results that carry no state (for instance those produced by the MDS
 * fallback, which sets no `state` field) yield null as well.
 *
 * @param doiValue The DOI identifier
 * @returns Promise<string | null> The DOI state or null if not found
 */
public async getDoiState(doiValue: string): Promise<string | null> {
    const info = await this.getDoiInfo(doiValue);
    if (!info || !info.state) {
        return null;
    }
    return info.state;
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
317
commands/fix_dataset_cross_references.ts
Normal file
317
commands/fix_dataset_cross_references.ts
Normal file
|
|
@ -0,0 +1,317 @@
|
||||||
|
/*
|
||||||
|
|--------------------------------------------------------------------------
|
||||||
|
| node ace make:command fix-dataset-cross-references
|
||||||
|
| DONE: create commands/fix_dataset_cross_references.ts
|
||||||
|
|--------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
import { BaseCommand, flags } from '@adonisjs/core/ace';
|
||||||
|
import type { CommandOptions } from '@adonisjs/core/types/ace';
|
||||||
|
import Dataset from '#models/dataset';
|
||||||
|
import DatasetReference from '#models/dataset_reference';
|
||||||
|
// import env from '#start/env';
|
||||||
|
|
||||||
|
/**
 * One dataset reference whose reverse counterpart was not found.
 * Instances are produced by findMissingCrossReferences().
 */
interface MissingCrossReference {
|
||||||
|
/** document_id of the existing reference, i.e. the dataset that holds it. */
sourceDatasetId: number;
|
||||||
|
/** id of the published dataset the reference points to. */
targetDatasetId: number;
|
||||||
|
/** publish_id of the source dataset, or null when it has none. */
sourcePublishId: number | null;
|
||||||
|
/** publish_id of the target dataset, or null when it has none. */
targetPublishId: number | null;
|
||||||
|
/** type of the existing reference (the query only selects 'DOI' and 'URL'). */
referenceType: string;
|
||||||
|
/** relation of the existing reference. */
relation: string;
|
||||||
|
/** value of the existing reference: the DOI or URL string it was stored with. */
doi: string | null;
|
||||||
|
/** relation the missing reverse reference should carry, from getReverseRelation(). */
reverseRelation: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
export default class DetectMissingCrossReferences extends BaseCommand {
|
||||||
|
static commandName = 'detect:missing-cross-references';
|
||||||
|
static description = 'Detect missing bidirectional cross-references between versioned datasets';
|
||||||
|
|
||||||
|
public static needsApplication = true;
|
||||||
|
|
||||||
|
@flags.boolean({ alias: 'f', description: 'Fix missing cross-references automatically' })
|
||||||
|
public fix: boolean = false;
|
||||||
|
|
||||||
|
@flags.boolean({ alias: 'v', description: 'Verbose output' })
|
||||||
|
public verbose: boolean = false;
|
||||||
|
|
||||||
|
public static options: CommandOptions = {
|
||||||
|
startApp: true,
|
||||||
|
staysAlive: false,
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Command entry point: reports references between published datasets whose
 * reverse reference is missing and, when `--fix` is set, creates them.
 *
 * Without `--fix` a formatted report is printed instead. Any error raised
 * while detecting or fixing is logged together with its message and the
 * process exits with code 1.
 */
async run() {
    this.logger.info('🔍 Detecting missing cross-references...');

    try {
        const missingReferences = await this.findMissingCrossReferences();

        if (missingReferences.length === 0) {
            this.logger.success('All cross-references are properly linked!');
            return;
        }

        this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s):`);

        for (const missing of missingReferences) {
            this.logger.info(
                `Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`,
            );

            if (this.verbose) {
                this.logger.info(` - Reference type: ${missing.referenceType}`);
                this.logger.info(` - Relation: ${missing.relation}`);
                this.logger.info(` - DOI: ${missing.doi}`);
            }
        }

        if (this.fix) {
            await this.fixMissingReferences(missingReferences);
            this.logger.success('All missing cross-references have been fixed!');
        } else {
            this.printMissingReferencesList(missingReferences);
            this.logger.info('💡 Run with --fix flag to automatically create missing cross-references');
        }
    } catch (error) {
        // The logger's second parameter is an options object, not extra output,
        // so the failure detail has to be part of the message itself.
        const detail = error instanceof Error ? error.message : String(error);
        this.logger.error(`Error detecting missing cross-references: ${detail}`);
        process.exit(1);
    }
}
|
||||||
|
/**
 * Collects every reference from a published dataset to another published
 * Tethys dataset for which no matching reverse reference exists.
 *
 * Candidate references are those of type DOI or URL whose value contains a
 * Tethys DOI (`doi.org/10.24341/tethys.`) or dataset URL
 * (`tethys.at/dataset/`). References whose target cannot be parsed, does not
 * exist, or is not published are skipped.
 *
 * @returns One entry per reference that lacks its reverse counterpart
 */
private async findMissingCrossReferences(): Promise<MissingCrossReference[]> {
    const missingReferences: MissingCrossReference[] = [];

    this.logger.info('📊 Querying dataset references...');

    // Find all references that point to Tethys datasets (DOI or URL containing tethys DOI)
    // Only from datasets that are published
    const tethysReferences = await DatasetReference.query()
        .whereIn('type', ['DOI', 'URL'])
        .where((query) => {
            query.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
        })
        .preload('dataset', (datasetQuery) => {
            datasetQuery.where('server_state', 'published');
        })
        .whereHas('dataset', (datasetQuery) => {
            datasetQuery.where('server_state', 'published');
        });

    this.logger.info(`🔗 Found ${tethysReferences.length} Tethys references from published datasets`);

    let processedCount = 0;
    for (const reference of tethysReferences) {
        processedCount++;

        // Progress output every ten references, in verbose mode only
        if (this.verbose && processedCount % 10 === 0) {
            this.logger.info(`📈 Processed ${processedCount}/${tethysReferences.length} references...`);
        }

        // Extract dataset publish_id from DOI or URL
        const targetDatasetPublish = this.extractDatasetPublishIdFromReference(reference.value);

        if (!targetDatasetPublish) {
            if (this.verbose) {
                this.logger.warning(`⚠️ Could not extract publish ID from: ${reference.value}`);
            }
            continue;
        }

        // Check if target dataset exists and is published
        const targetDataset = await Dataset.query()
            .where('publish_id', targetDatasetPublish)
            .where('server_state', 'published')
            .first();

        if (!targetDataset) {
            if (this.verbose) {
                this.logger.warning(`⚠️ Target dataset with publish_id ${targetDatasetPublish} not found or not published`);
            }
            continue;
        }

        // Ensure we have a valid source dataset with proper preloading
        if (!reference.dataset) {
            this.logger.warning(`⚠️ Source dataset ${reference.document_id} not properly loaded, skipping...`);
            continue;
        }

        // Check if reverse reference exists: it is held by the target and points back at the source
        const reverseReferenceExists = await this.checkReverseReferenceExists(
            targetDataset.id,
            reference.document_id,
            reference.relation,
        );

        if (!reverseReferenceExists) {
            missingReferences.push({
                sourceDatasetId: reference.document_id,
                targetDatasetId: targetDataset.id,
                sourcePublishId: reference.dataset.publish_id || null,
                targetPublishId: targetDataset.publish_id || null,
                referenceType: reference.type,
                relation: reference.relation,
                doi: reference.value,
                reverseRelation: this.getReverseRelation(reference.relation),
            });
        }
    }

    this.logger.info(`✅ Processed all ${processedCount} references`);
    return missingReferences;
}
|
||||||
|
|
||||||
|
/**
 * Pulls the numeric dataset id out of a Tethys reference value.
 *
 * Two forms are recognised, tried in this order:
 *   - DOI: https://doi.org/10.24341/tethys.107 -> 107
 *   - URL: https://tethys.at/dataset/107 -> 107
 *
 * Only the first run of digits after the prefix is used, so anything
 * following it (such as a `.2` suffix) is ignored.
 *
 * @param value The stored reference value (DOI or URL)
 * @returns The extracted number, or null when neither form matches
 */
private extractDatasetPublishIdFromReference(value: string): number | null {
    const patterns = [/10\.24341\/tethys\.(\d+)/, /tethys\.at\/dataset\/(\d+)/];

    for (const pattern of patterns) {
        const hit = value.match(pattern);
        if (hit) {
            return parseInt(hit[1]);
        }
    }

    return null;
}
|
||||||
|
|
||||||
|
/**
 * Checks whether a reference carrying the reverse relation already exists.
 *
 * Parameter names are from the point of view of the reverse reference:
 * `sourceDatasetId` is the dataset expected to hold it (`document_id`) and
 * `targetDatasetId` the dataset it should point to (`related_document_id`).
 * Only references owned by a published dataset count.
 *
 * @param sourceDatasetId Dataset expected to hold the reverse reference
 * @param targetDatasetId Dataset the reverse reference should point to
 * @param originalRelation Relation of the forward reference; its reverse is looked up
 * @returns True when such a reference was found
 */
private async checkReverseReferenceExists(
    sourceDatasetId: number,
    targetDatasetId: number,
    originalRelation: string,
): Promise<boolean> {
    const expectedRelation = this.getReverseRelation(originalRelation);

    const lookup = DatasetReference.query()
        .where('document_id', sourceDatasetId)
        .where('related_document_id', targetDatasetId)
        .where('relation', expectedRelation)
        .whereHas('dataset', (owner) => {
            // The holder of the reverse reference must itself be published
            owner.where('server_state', 'published');
        });

    const match = await lookup.first();
    return Boolean(match);
}
|
||||||
|
|
||||||
|
/**
 * Maps a relation type to the relation its reverse reference should carry.
 *
 * Known inverse pairs are translated in both directions. Every relation that
 * is not part of a known pair (including differently cased spellings such as
 * 'compiles') falls back to 'HasVersion'.
 *
 * A Map is used for the lookup so that relation strings matching inherited
 * object members (e.g. 'constructor') also take the fallback instead of
 * returning a non-string value.
 *
 * @param relation Relation of the existing (forward) reference
 * @returns The reverse relation, or 'HasVersion' when no pair is known
 */
private getReverseRelation(relation: string): string {
    const inversePairs: ReadonlyArray<readonly [string, string]> = [
        ['IsNewVersionOf', 'IsPreviousVersionOf'],
        ['IsVersionOf', 'HasVersion'],
        ['Compiles', 'IsCompiledBy'],
        ['IsVariantFormOf', 'IsOriginalFormOf'],
        ['IsPartOf', 'HasPart'],
        ['IsSupplementTo', 'IsSupplementedBy'],
        ['Continues', 'IsContinuedBy'],
    ];

    // Register each pair in both directions
    const relationMap = new Map<string, string>();
    for (const [forward, backward] of inversePairs) {
        relationMap.set(forward, backward);
        relationMap.set(backward, forward);
    }

    return relationMap.get(relation) ?? 'HasVersion'; // Default fallback
}
|
||||||
|
|
||||||
|
/**
 * Writes a human-readable report of all missing reverse references to stdout.
 *
 * Layout: a banner, one numbered entry per missing reference (source and
 * target ids, current and missing relation, reference type, DOI/URL), and a
 * closing summary box with the total count.
 *
 * @param missingReferences - Entries to list, printed in array order.
 */
private printMissingReferencesList(missingReferences: MissingCrossReference[]) {
    const topBorder = '┌─────────────────────────────────────────────────────────────────────────────────┐';
    const bottomBorder = '└─────────────────────────────────────────────────────────────────────────────────┘';

    // Banner
    const banner = [topBorder, '│ MISSING CROSS-REFERENCES REPORT │', '│ (Published Datasets Only) │', bottomBorder];
    for (const bannerLine of banner) {
        console.log(bannerLine);
    }
    console.log();

    // One entry per missing reference, numbered from 1.
    let position = 0;
    for (const entry of missingReferences) {
        position += 1;
        console.log(
            `${position}. Dataset ${entry.sourceDatasetId} (Publish ID: ${entry.sourcePublishId}) → Dataset ${entry.targetDatasetId} (Publish ID: ${entry.targetPublishId})`,
        );
        console.log(` ├─ Current relation: "${entry.relation}"`);
        console.log(` ├─ Missing reverse relation: "${entry.reverseRelation}"`);
        console.log(` ├─ Reference type: ${entry.referenceType}`);
        console.log(` └─ DOI/URL: ${entry.doi}`);
        console.log();
    }

    // Summary box
    console.log(topBorder);
    console.log(`│ SUMMARY: ${missingReferences.length} missing reverse reference(s) detected │`);
    console.log(bottomBorder);
}
|
||||||
|
|
||||||
|
/**
 * Creates the missing reverse reference rows in the database.
 *
 * For every entry the source dataset is re-read (it must still be
 * 'published'); a new DatasetReference is then saved on the TARGET dataset
 * pointing back at the source. Entries are processed independently: a failure
 * is logged and counted, and processing continues with the next entry.
 *
 * @param missingReferences - Entries describing which reverse references to create.
 */
private async fixMissingReferences(missingReferences: MissingCrossReference[]) {
    this.logger.info('🔧 Creating missing cross-references in database...');

    let fixedCount = 0;
    let errorCount = 0;

    for (const [index, missing] of missingReferences.entries()) {
        try {
            // Get the source dataset to create proper reference - ensure it's published
            const sourceDataset = await Dataset.query()
                .where('id', missing.sourceDatasetId)
                .where('server_state', 'published')
                .preload('identifier')
                .first();

            if (!sourceDataset) {
                this.logger.warning(`⚠️ Source dataset ${missing.sourceDatasetId} not found or not published, skipping...`);
                errorCount++;
                continue;
            }

            // Create the reverse reference: it lives on the target and points at the source.
            const reverseReference = new DatasetReference();
            reverseReference.document_id = missing.targetDatasetId;
            reverseReference.related_document_id = missing.sourceDatasetId;
            reverseReference.relation = missing.reverseRelation;

            const sourceDoi = sourceDataset.identifier?.value;
            if (sourceDoi) {
                // Use the source dataset's DOI for the value
                reverseReference.type = 'DOI';
                reverseReference.value = `https://doi.org/${sourceDoi}`;
            } else {
                // Fallback to dataset URL if no DOI. The stored type must describe the
                // stored value, so this branch is recorded as a URL rather than a DOI.
                // NOTE(review): assumes 'URL' is an accepted reference type — confirm.
                reverseReference.type = 'URL';
                reverseReference.value = `https://tethys.at/dataset/${sourceDataset.publish_id || missing.sourceDatasetId}`;
            }

            // Use the source dataset's main title for the label
            reverseReference.label = sourceDataset.mainTitle || `Dataset ${missing.sourceDatasetId}`;

            await reverseReference.save();
            fixedCount++;

            if (this.verbose) {
                this.logger.info(
                    `✅ [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.targetDatasetId} -> ${missing.sourceDatasetId}`,
                );
            } else if ((index + 1) % 10 === 0) {
                // Non-verbose mode: progress line after every tenth entry that saved successfully.
                this.logger.info(`📈 Fixed ${fixedCount}/${missingReferences.length} references...`);
            }
        } catch (error) {
            this.logger.error(
                `❌ Error creating reverse reference for datasets ${missing.targetDatasetId} -> ${missing.sourceDatasetId}:`,
                error,
            );
            errorCount++;
        }
    }

    this.logger.info(`📊 Fix completed: ${fixedCount} created, ${errorCount} errors`);
}
|
||||||
|
}
|
||||||
271
commands/update_datacite.ts
Normal file
271
commands/update_datacite.ts
Normal file
|
|
@ -0,0 +1,271 @@
|
||||||
|
/*
|
||||||
|
|--------------------------------------------------------------------------
|
||||||
|
| node ace make:command update-datacite
|
||||||
|
| DONE: create commands/update_datacite.ts
|
||||||
|
|--------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
import { BaseCommand, flags } from '@adonisjs/core/ace';
|
||||||
|
import { CommandOptions } from '@adonisjs/core/types/ace';
|
||||||
|
import Dataset from '#models/dataset';
|
||||||
|
import { DoiClient } from '#app/Library/Doi/DoiClient';
|
||||||
|
import DoiClientException from '#app/exceptions/DoiClientException';
|
||||||
|
import Index from '#app/Library/Utils/Index';
|
||||||
|
import env from '#start/env';
|
||||||
|
import logger from '@adonisjs/core/services/logger';
|
||||||
|
import { DateTime } from 'luxon';
|
||||||
|
import { getDomain } from '#app/utils/utility-functions';
|
||||||
|
|
||||||
|
/**
 * Ace command `update:datacite`.
 *
 * Walks over published datasets that own a DOI identifier and re-registers
 * their metadata with DataCite when the local record is newer than the DOI
 * record (or unconditionally with --force). --dry-run and --stats only report
 * and never call the register endpoint.
 */
export default class UpdateDatacite extends BaseCommand {
    static commandName = 'update:datacite';
    static description = 'Update DataCite DOI records for published datasets';

    public static needsApplication = true;

    @flags.number({ alias: 'p', description: 'Specific publish_id to update' })
    public publish_id: number;

    @flags.boolean({ alias: 'f', description: 'Force update all records regardless of modification date' })
    public force: boolean = false;

    @flags.boolean({ alias: 'd', description: 'Dry run - show what would be updated without making changes' })
    public dryRun: boolean = false;

    @flags.boolean({ alias: 's', description: 'Show detailed stats for each dataset that needs updating' })
    public stats: boolean = false;

    //example: node ace update:datacite -p 123 --force --dry-run

    public static options: CommandOptions = {
        startApp: true, // Whether to boot the application before running the command
        stayAlive: false, // Whether to keep the process alive after the command has executed
    };

    /**
     * Command entry point.
     *
     * Per dataset, in this order: decide whether an update is needed; in stats
     * mode only print a report; otherwise skip up-to-date records, honour
     * --dry-run, and finally push the record to DataCite. A failure for one
     * dataset is logged and counted without aborting the run.
     */
    async run() {
        logger.info('Starting DataCite update process...');

        const prefix = env.get('DATACITE_PREFIX', '');
        const base_domain = env.get('BASE_DOMAIN', '');
        const apiUrl = env.get('DATACITE_API_URL', 'https://api.datacite.org');

        if (!prefix || !base_domain) {
            logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
            return;
        }

        // apiUrl is only reported here; this command does not pass it on to DoiClient.
        logger.info(`Using DataCite API: ${apiUrl}`);

        const datasets = await this.getDatasets();
        logger.info(`Found ${datasets.length} datasets to process`);

        let updated = 0;
        let skipped = 0;
        let errors = 0;

        for (const dataset of datasets) {
            try {
                // --force short-circuits the remote modification-date check.
                const shouldUpdate = this.force || (await this.shouldUpdateDataset(dataset));

                if (this.stats) {
                    // Stats mode: show detailed information for datasets that need updating
                    if (shouldUpdate) {
                        await this.showDatasetStats(dataset);
                        updated++;
                    } else {
                        skipped++;
                    }
                    continue;
                }

                if (!shouldUpdate) {
                    logger.info(`Dataset ${dataset.publish_id}: Up to date, skipping`);
                    skipped++;
                    continue;
                }

                if (this.dryRun) {
                    logger.info(`Dataset ${dataset.publish_id}: Would update DataCite record (dry run)`);
                    updated++;
                    continue;
                }

                await this.updateDataciteRecord(dataset, prefix, base_domain);
                logger.info(`Dataset ${dataset.publish_id}: Successfully updated DataCite record`);
                updated++;
            } catch (error) {
                logger.error(`Dataset ${dataset.publish_id}: Failed to update - ${this.describeError(error)}`);
                errors++;
            }
        }

        if (this.stats) {
            logger.info(`\nDataCite Stats Summary: ${updated} datasets need updating, ${skipped} are up to date`);
        } else {
            logger.info(`DataCite update completed. Updated: ${updated}, Skipped: ${skipped}, Errors: ${errors}`);
        }
    }

    /**
     * Turns a caught value into printable text. Thrown values are not
     * guaranteed to be Error instances, so `.message` is read only after
     * narrowing.
     */
    private describeError(error: unknown): string {
        return error instanceof Error ? error.message : String(error);
    }

    /**
     * Loads the datasets to process: published datasets that have an
     * identifier of type 'doi', optionally narrowed to the --publish_id flag.
     * The `identifier` and `xmlCache` relations are preloaded.
     */
    private async getDatasets(): Promise<Dataset[]> {
        const query = Dataset.query()
            .preload('identifier')
            .preload('xmlCache')
            .where('server_state', 'published')
            .whereHas('identifier', (identifierQuery) => {
                identifierQuery.where('type', 'doi');
            });

        if (this.publish_id) {
            query.where('publish_id', this.publish_id);
        }

        return await query.exec();
    }

    /**
     * Decides whether the DataCite record of a dataset is stale.
     *
     * @returns
     *  - false when the dataset has no DOI identifier, or when its
     *    modification date lies in the future (treated as corrupt data);
     *  - true when the local modification date is missing, when DataCite
     *    returns no modification date, or when any error occurs during the
     *    check (the command prefers an unnecessary update over a missed one);
     *  - otherwise whether the dataset was modified after the DOI record.
     */
    private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
        try {
            // Check if dataset has a DOI identifier (HasOne relationship)
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
                // Try to load the relationship if not already loaded
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
                logger.warn(`Dataset ${dataset.publish_id}: No DOI identifier found`);
                return false;
            }

            // Validate dataset modification date
            const datasetModified = dataset.server_date_modified;
            const now = DateTime.now();

            if (!datasetModified) {
                logger.error(`Dataset ${dataset.publish_id}: server_date_modified is null or undefined`);
                return true; // Update anyway if modification date is missing
            }

            if (datasetModified > now) {
                logger.error(
                    `Dataset ${dataset.publish_id}: server_date_modified (${datasetModified.toISO()}) is in the future! ` +
                        `Current time: ${now.toISO()}. This indicates a data integrity issue. Skipping update.`,
                );
                return false; // Do not update when modification date is invalid
            }

            // Get DOI information from DataCite using DoiClient
            const doiClient = new DoiClient();
            const doiLastModified = await doiClient.getDoiLastModified(doiIdentifier.value);

            if (!doiLastModified) {
                logger.warn(`Dataset ${dataset.publish_id}: Could not retrieve DOI modification date from DataCite`);
                return true; // Update anyway if we can't get DOI info
            }

            // Compare dataset modification date with DOI modification date
            const doiModified = DateTime.fromJSDate(doiLastModified);

            logger.debug(
                `Dataset ${dataset.publish_id}: Dataset modified: ${datasetModified.toISO()}, DOI modified: ${doiModified.toISO()}`,
            );

            // Update if dataset was modified after the DOI record
            return datasetModified > doiModified;
        } catch (error) {
            logger.warn(`Error checking update status for dataset ${dataset.publish_id}: ${this.describeError(error)}`);
            return true; // Update anyway if we can't determine status
        }
    }

    /**
     * Regenerates the metadata XML for a dataset and registers it with
     * DataCite under the dataset's existing DOI.
     *
     * @param dataset - Dataset whose DOI record is refreshed.
     * @param prefix - DataCite prefix; accepted for interface stability but not
     *                 used here because the stored DOI value is reused as-is.
     * @param base_domain - Base domain from which the landing page host is derived.
     * @throws DoiClientException when DataCite answers with anything other than 201.
     * @throws Error (wrapped) for every other failure.
     */
    private async updateDataciteRecord(dataset: Dataset, prefix: string, base_domain: string): Promise<void> {
        try {
            // Get the DOI identifier (HasOne relationship)
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            if (!doiIdentifier || doiIdentifier.type !== 'doi') {
                throw new Error('No DOI identifier found for dataset');
            }

            // Generate XML metadata
            const xmlMeta = (await Index.getDoiRegisterString(dataset)) as string;
            if (!xmlMeta) {
                throw new Error('Failed to generate XML metadata');
            }

            // Construct DOI value and landing page URL
            const doiValue = doiIdentifier.value; // Use existing DOI value
            const landingPageUrl = `https://doi.${getDomain(base_domain)}/${doiValue}`;

            // Update DataCite record
            const doiClient = new DoiClient();
            const dataciteResponse = await doiClient.registerDoi(doiValue, xmlMeta, landingPageUrl);

            // NOTE(review): only HTTP 201 counts as success — confirm registerDoi
            // never answers 200 when it updates an already registered DOI.
            if (dataciteResponse?.status === 201) {
                // Bumping the modification date and re-indexing are currently disabled:
                // // Update dataset modification date
                // dataset.server_date_modified = DateTime.now();
                // await dataset.save();

                // // Update search index
                // const index_name = 'tethys-records';
                // await Index.indexDocument(dataset, index_name);

                // Only the DataCite record is touched (see disabled code above), so the
                // message no longer claims that the search index was updated as well.
                logger.debug(`Dataset ${dataset.publish_id}: DataCite record updated successfully`);
            } else {
                throw new DoiClientException(
                    dataciteResponse?.status || 500,
                    `Unexpected DataCite response code: ${dataciteResponse?.status}`,
                );
            }
        } catch (error) {
            // Keep the typed client exception intact; wrap everything else with context.
            if (error instanceof DoiClientException) {
                throw error;
            }
            throw new Error(`Failed to update DataCite record: ${this.describeError(error)}`);
        }
    }

    /**
     * Shows detailed statistics for a dataset that needs updating
     *
     * Prints a boxed report to stdout. The "Needs Update" line states the
     * reason that actually applies instead of always claiming that the dataset
     * is newer than the DOI record (which is not true for forced runs or when
     * one of the two dates is unavailable). Errors while gathering the data are
     * reported inside the box and never propagate.
     */
    private async showDatasetStats(dataset: Dataset): Promise<void> {
        const boxTop = `┌─ Dataset ${dataset.publish_id} ───────────────────────────────────────────────────────────────`;
        const boxBottom = '└─────────────────────────────────────────────────────────────────────────────────────────────';

        try {
            let doiIdentifier = dataset.identifier;

            if (!doiIdentifier) {
                await dataset.load('identifier');
                doiIdentifier = dataset.identifier;
            }

            const doiValue = doiIdentifier?.value || 'N/A';
            const doiStatus = doiIdentifier?.status || 'N/A';
            const datasetModified = dataset.server_date_modified;

            // Get DOI info from DataCite
            const doiClient = new DoiClient();
            const doiLastModified = await doiClient.getDoiLastModified(doiValue);
            const doiState = await doiClient.getDoiState(doiValue);

            let reason = 'Dataset newer than DOI';
            if (!datasetModified) {
                reason = 'Dataset modification date missing';
            } else if (!doiLastModified) {
                reason = 'DOI modification date unavailable';
            } else if (!(datasetModified > DateTime.fromJSDate(doiLastModified))) {
                // Dates say "up to date", yet the caller flagged the dataset.
                reason = this.force ? 'Forced update (--force)' : 'Update status could not be determined';
            }

            // The leading empty entry reproduces the blank line in front of the box.
            const report = [
                '',
                boxTop,
                `│ DOI Value: ${doiValue}`,
                `│ DOI Status (DB): ${doiStatus}`,
                `│ DOI State (DataCite): ${doiState || 'Unknown'}`,
                `│ Dataset Modified: ${datasetModified ? datasetModified.toISO() : 'N/A'}`,
                `│ DOI Modified: ${doiLastModified ? DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}`,
                `│ Needs Update: YES - ${reason}`,
                boxBottom,
            ];
            console.log(report.join('\n'));
        } catch (error) {
            const report = [
                '',
                boxTop,
                `│ DOI Value: ${dataset.identifier?.value || 'N/A'}`,
                `│ Error: ${this.describeError(error)}`,
                '│ Needs Update: YES - Error checking status',
                boxBottom,
            ];
            console.log(report.join('\n'));
        }
    }
}
|
||||||
278
docs/commands/index-datasets.md
Normal file
278
docs/commands/index-datasets.md
Normal file
|
|
@ -0,0 +1,278 @@
|
||||||
|
# Dataset Indexing Command
|
||||||
|
|
||||||
|
AdonisJS Ace command for indexing and synchronizing published datasets with OpenSearch for search functionality.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The `index:datasets` command processes published datasets and creates/updates corresponding search index documents in OpenSearch. It intelligently compares modification timestamps to only re-index datasets when necessary, optimizing performance while maintaining search index accuracy.
|
||||||
|
|
||||||
|
## Command Syntax
|
||||||
|
|
||||||
|
```bash
|
||||||
|
node ace index:datasets [options]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Options
|
||||||
|
|
||||||
|
| Flag | Alias | Description |
|
||||||
|
|------|-------|-------------|
|
||||||
|
| `--publish_id <number>` | `-p` | Index a specific dataset by publish_id |
|
||||||
|
|
||||||
|
## Usage Examples
|
||||||
|
|
||||||
|
### Basic Operations
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Index all published datasets that have been modified since last indexing
|
||||||
|
node ace index:datasets
|
||||||
|
|
||||||
|
# Index a specific dataset by publish_id
|
||||||
|
node ace index:datasets --publish_id 231
|
||||||
|
node ace index:datasets -p 231
|
||||||
|
```
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
### 1. **Dataset Selection**
|
||||||
|
The command processes datasets that meet these criteria:
|
||||||
|
- `server_state = 'published'` - Only published datasets
|
||||||
|
- Has preloaded `xmlCache` relationship for metadata transformation
|
||||||
|
- Optionally filtered by specific `publish_id`
|
||||||
|
|
||||||
|
### 2. **Smart Update Detection**
|
||||||
|
For each dataset, the command:
|
||||||
|
- Checks if the dataset exists in the OpenSearch index
|
||||||
|
- Compares `server_date_modified` timestamps
|
||||||
|
- Only re-indexes if the dataset is newer than the indexed version
|
||||||
|
|
||||||
|
### 3. **Document Processing**
|
||||||
|
The indexing process involves:
|
||||||
|
1. **XML Generation**: Creates structured XML from dataset metadata
|
||||||
|
2. **XSLT Transformation**: Converts XML to JSON using Saxon-JS processor
|
||||||
|
3. **Index Update**: Updates or creates the document in OpenSearch
|
||||||
|
4. **Logging**: Records success/failure for each operation
|
||||||
|
|
||||||
|
## Index Structure
|
||||||
|
|
||||||
|
### Index Configuration
|
||||||
|
- **Index Name**: `tethys-records`
|
||||||
|
- **Document ID**: Dataset `publish_id`
|
||||||
|
- **Refresh**: `true` (immediate availability)
|
||||||
|
|
||||||
|
### Document Fields
|
||||||
|
The indexed documents contain:
|
||||||
|
- **Metadata Fields**: Title, description, authors, keywords
|
||||||
|
- **Identifiers**: DOI, publish_id, and other identifiers
|
||||||
|
- **Temporal Data**: Publication dates, coverage periods
|
||||||
|
- **Geographic Data**: Spatial coverage information
|
||||||
|
- **Technical Details**: Data formats, access information
|
||||||
|
- **Timestamps**: Creation and modification dates
|
||||||
|
|
||||||
|
## Example Output
|
||||||
|
|
||||||
|
### Successful Run
|
||||||
|
```bash
|
||||||
|
node ace index:datasets
|
||||||
|
```
|
||||||
|
```
|
||||||
|
Found 150 published datasets to process
|
||||||
|
Dataset with publish_id 231 successfully indexed
|
||||||
|
Dataset with publish_id 245 is up to date, skipping indexing
|
||||||
|
Dataset with publish_id 267 successfully indexed
|
||||||
|
An error occurred while indexing dataset with publish_id 289. Error: Invalid XML metadata
|
||||||
|
Processing completed: 148 indexed, 1 skipped, 1 error
|
||||||
|
```
|
||||||
|
|
||||||
|
### Specific Dataset
|
||||||
|
```bash
|
||||||
|
node ace index:datasets --publish_id 231
|
||||||
|
```
|
||||||
|
```
|
||||||
|
Found 1 published dataset to process
|
||||||
|
Dataset with publish_id 231 successfully indexed
|
||||||
|
Processing completed: 1 indexed, 0 skipped, 0 errors
|
||||||
|
```
|
||||||
|
|
||||||
|
## Update Logic
|
||||||
|
|
||||||
|
The command uses intelligent indexing to avoid unnecessary processing:
|
||||||
|
|
||||||
|
| Condition | Action | Reason |
|
||||||
|
|-----------|--------|--------|
|
||||||
|
| Dataset not in index | ✅ Index | New dataset needs indexing |
|
||||||
|
| Dataset newer than indexed version | ✅ Re-index | Dataset has been updated |
|
||||||
|
| Dataset same/older than indexed version | ❌ Skip | Already up to date |
|
||||||
|
| OpenSearch document check fails | ✅ Index | Better safe than sorry |
|
||||||
|
| Invalid XML metadata | ❌ Skip + Log Error | Cannot process invalid data |
|
||||||
|
|
||||||
|
### Timestamp Comparison
|
||||||
|
```typescript
|
||||||
|
// Example comparison logic
|
||||||
|
const existingModified = DateTime.fromMillis(Number(existingDoc.server_date_modified) * 1000);
|
||||||
|
const currentModified = dataset.server_date_modified;
|
||||||
|
|
||||||
|
if (currentModified <= existingModified) {
|
||||||
|
// Skip - already up to date
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Proceed with indexing
|
||||||
|
```
|
||||||
|
|
||||||
|
## XML Transformation Process
|
||||||
|
|
||||||
|
### 1. **XML Generation**
|
||||||
|
```xml
|
||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||||
|
<root>
|
||||||
|
<Dataset>
|
||||||
|
<!-- Dataset metadata fields -->
|
||||||
|
<title>Research Dataset Title</title>
|
||||||
|
<description>Dataset description...</description>
|
||||||
|
<!-- Additional metadata -->
|
||||||
|
</Dataset>
|
||||||
|
</root>
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. **XSLT Processing**
|
||||||
|
The command uses Saxon-JS with a compiled stylesheet (`solr.sef.json`) to transform XML to JSON:
|
||||||
|
```javascript
|
||||||
|
const result = await SaxonJS.transform({
|
||||||
|
stylesheetText: proc,
|
||||||
|
destination: 'serialized',
|
||||||
|
sourceText: xmlString,
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. **Final JSON Document**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "231",
|
||||||
|
"title": "Research Dataset Title",
|
||||||
|
"description": "Dataset description...",
|
||||||
|
"authors": ["Author Name"],
|
||||||
|
"server_date_modified": 1634567890,
|
||||||
|
"publish_id": 231
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration Requirements
|
||||||
|
|
||||||
|
### Environment Variables
|
||||||
|
```bash
|
||||||
|
# OpenSearch Configuration
|
||||||
|
OPENSEARCH_HOST=localhost:9200
|
||||||
|
|
||||||
|
# For production:
|
||||||
|
# OPENSEARCH_HOST=your-opensearch-cluster:9200
|
||||||
|
```
|
||||||
|
|
||||||
|
### Required Files
|
||||||
|
- **XSLT Stylesheet**: `public/assets2/solr.sef.json` - Compiled Saxon-JS stylesheet for XML transformation
|
||||||
|
|
||||||
|
### Database Relationships
|
||||||
|
The command expects these model relationships:
|
||||||
|
```typescript
|
||||||
|
// Dataset model must have:
|
||||||
|
@hasOne(() => XmlCache, { foreignKey: 'dataset_id' })
|
||||||
|
public xmlCache: HasOne<typeof XmlCache>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
The command handles various error scenarios gracefully:
|
||||||
|
|
||||||
|
### Common Errors and Solutions
|
||||||
|
|
||||||
|
| Error | Cause | Solution |
|
||||||
|
|-------|-------|----------|
|
||||||
|
| `XSLT transformation failed` | Invalid XML or missing stylesheet | Check XML structure and stylesheet path |
|
||||||
|
| `OpenSearch connection error` | Service unavailable | Verify OpenSearch is running and accessible |
|
||||||
|
| `JSON parse error` | Malformed transformation result | Check XSLT stylesheet output format |
|
||||||
|
| `Missing xmlCache relationship` | Data integrity issue | Ensure xmlCache exists for dataset |
|
||||||
|
|
||||||
|
### Error Logging
|
||||||
|
```bash
|
||||||
|
# Typical error log entry
|
||||||
|
An error occurred while indexing dataset with publish_id 231.
|
||||||
|
Error: XSLT transformation failed: Invalid XML structure at line 15
|
||||||
|
```
|
||||||
|
|
||||||
|
## Performance Considerations
|
||||||
|
|
||||||
|
### Batch Processing
|
||||||
|
- Processes datasets sequentially to avoid overwhelming OpenSearch
|
||||||
|
- Each dataset is committed individually for reliability
|
||||||
|
- Failed indexing of one dataset doesn't stop processing others
|
||||||
|
|
||||||
|
### Resource Usage
|
||||||
|
- **Memory**: XML/JSON transformations require temporary memory
|
||||||
|
- **Network**: OpenSearch API calls for each dataset
|
||||||
|
- **CPU**: XSLT transformations are CPU-intensive
|
||||||
|
|
||||||
|
### Optimization Tips
|
||||||
|
```bash
|
||||||
|
# Index only recently modified datasets (run regularly)
|
||||||
|
node ace index:datasets
|
||||||
|
|
||||||
|
# Index specific datasets when needed
|
||||||
|
node ace index:datasets --publish_id 231
|
||||||
|
|
||||||
|
# Consider running during off-peak hours for large batches
|
||||||
|
```
|
||||||
|
|
||||||
|
## Integration with Other Systems
|
||||||
|
|
||||||
|
### Search Functionality
|
||||||
|
The indexed documents power:
|
||||||
|
- **Dataset Search**: Full-text search across metadata
|
||||||
|
- **Faceted Browsing**: Filter by authors, keywords, dates
|
||||||
|
- **Geographic Search**: Spatial query capabilities
|
||||||
|
- **Auto-complete**: Suggest dataset titles and keywords
|
||||||
|
|
||||||
|
### Related Commands
|
||||||
|
- [`update:datacite`](update-datacite.md) - Often run after indexing to sync DOI metadata
|
||||||
|
- **Database migrations** - May require re-indexing after schema changes
|
||||||
|
|
||||||
|
### API Integration
|
||||||
|
The indexed data is consumed by:
|
||||||
|
- **Search API**: `/api/search` endpoints
|
||||||
|
- **Browse API**: `/api/datasets` with filtering
|
||||||
|
- **Recommendations**: Related dataset suggestions
|
||||||
|
|
||||||
|
## Monitoring and Maintenance
|
||||||
|
|
||||||
|
### Regular Tasks
|
||||||
|
```bash
|
||||||
|
# Daily indexing (recommended cron job)
|
||||||
|
0 2 * * * cd /path/to/project && node ace index:datasets
|
||||||
|
|
||||||
|
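# Weekly full re-index (if needed). NOTE: `--force` is not listed in the Options table above; confirm the command supports it before scheduling this job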
|
||||||
|
0 3 * * 0 cd /path/to/project && node ace index:datasets --force
|
||||||
|
```
|
||||||
|
|
||||||
|
### Health Checks
|
||||||
|
- Monitor OpenSearch cluster health
|
||||||
|
- Check for failed indexing operations in logs
|
||||||
|
- Verify search functionality is working
|
||||||
|
- Compare dataset counts between database and index
|
||||||
|
|
||||||
|
### Troubleshooting
|
||||||
|
```bash
|
||||||
|
# Check specific dataset indexing
|
||||||
|
node ace index:datasets --publish_id 231
|
||||||
|
|
||||||
|
# Verify OpenSearch connectivity
|
||||||
|
curl -X GET "localhost:9200/_cluster/health"
|
||||||
|
|
||||||
|
# Check index statistics
|
||||||
|
curl -X GET "localhost:9200/tethys-records/_stats"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
1. **Regular Scheduling**: Run the command regularly (daily) to keep the search index current
|
||||||
|
2. **Monitor Logs**: Watch for transformation errors or OpenSearch issues
|
||||||
|
3. **Backup Strategy**: Include OpenSearch indices in backup procedures
|
||||||
|
4. **Resource Management**: Monitor OpenSearch cluster resources during bulk operations
|
||||||
|
5. **Testing**: Verify search functionality after major indexing operations
|
||||||
|
6. **Coordination**: Run indexing before DataCite updates when both are needed
|
||||||
216
docs/commands/update-datacite.md
Normal file
216
docs/commands/update-datacite.md
Normal file
|
|
@ -0,0 +1,216 @@
|
||||||
|
# DataCite Update Command
|
||||||
|
|
||||||
|
AdonisJS Ace command for updating DataCite DOI records for published datasets.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The `update:datacite` command synchronizes your local dataset metadata with DataCite DOI records. It intelligently compares modification dates to only update records when necessary, reducing unnecessary API calls and maintaining data consistency.
|
||||||
|
|
||||||
|
## Command Syntax
|
||||||
|
|
||||||
|
```bash
|
||||||
|
node ace update:datacite [options]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Options
|
||||||
|
|
||||||
|
| Flag | Alias | Description |
|
||||||
|
|------|-------|-------------|
|
||||||
|
| `--publish_id <number>` | `-p` | Update a specific dataset by publish_id |
|
||||||
|
| `--force` | `-f` | Force update all records regardless of modification date |
|
||||||
|
| `--dry-run` | `-d` | Preview what would be updated without making changes |
|
||||||
|
| `--stats` | `-s` | Show detailed statistics for datasets that need updating |
|
||||||
|
|
||||||
|
## Usage Examples
|
||||||
|
|
||||||
|
### Basic Operations
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Update all datasets that have been modified since their DOI was last updated
|
||||||
|
node ace update:datacite
|
||||||
|
|
||||||
|
# Update a specific dataset
|
||||||
|
node ace update:datacite --publish_id 231
|
||||||
|
node ace update:datacite -p 231
|
||||||
|
|
||||||
|
# Force update all datasets with DOIs (ignores modification dates)
|
||||||
|
node ace update:datacite --force
|
||||||
|
```
|
||||||
|
|
||||||
|
### Preview and Analysis
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Preview what would be updated (dry run)
|
||||||
|
node ace update:datacite --dry-run
|
||||||
|
|
||||||
|
# Show detailed statistics for datasets that need updating
|
||||||
|
node ace update:datacite --stats
|
||||||
|
|
||||||
|
# Show stats for a specific dataset
|
||||||
|
node ace update:datacite --stats --publish_id 231
|
||||||
|
```
|
||||||
|
|
||||||
|
### Combined Options
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Dry run for a specific dataset
|
||||||
|
node ace update:datacite --dry-run --publish_id 231
|
||||||
|
|
||||||
|
# Show stats for all datasets (including up-to-date ones)
|
||||||
|
node ace update:datacite --stats --force
|
||||||
|
```
|
||||||
|
|
||||||
|
## Command Modes
|
||||||
|
|
||||||
|
### 1. **Normal Mode** (Default)
|
||||||
|
Updates DataCite records for datasets that have been modified since their DOI was last updated.
|
||||||
|
|
||||||
|
**Example Output:**
|
||||||
|
```
|
||||||
|
Using DataCite API: https://api.test.datacite.org
|
||||||
|
Found 50 datasets to process
|
||||||
|
Dataset 231: Successfully updated DataCite record
|
||||||
|
Dataset 245: Up to date, skipping
|
||||||
|
Dataset 267: Successfully updated DataCite record
|
||||||
|
DataCite update completed. Updated: 15, Skipped: 35, Errors: 0
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. **Dry Run Mode** (`--dry-run`)
|
||||||
|
Shows what would be updated without making any changes to DataCite.
|
||||||
|
|
||||||
|
**Use Case:** Preview updates before running the actual command.
|
||||||
|
|
||||||
|
**Example Output:**
|
||||||
|
```
|
||||||
|
Dataset 231: Would update DataCite record (dry run)
|
||||||
|
Dataset 267: Would update DataCite record (dry run)
|
||||||
|
Dataset 245: Up to date, skipping
|
||||||
|
DataCite update completed. Updated: 2, Skipped: 1, Errors: 0
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. **Stats Mode** (`--stats`)
|
||||||
|
Shows detailed information for each dataset that needs updating, including why it needs updating.
|
||||||
|
|
||||||
|
**Use Case:** Debug synchronization issues, monitor dataset/DOI status, generate reports.
|
||||||
|
|
||||||
|
**Example Output:**
|
||||||
|
```
|
||||||
|
┌─ Dataset 231 ─────────────────────────────────────────────────────────
|
||||||
|
│ DOI Value: 10.21388/tethys.231
|
||||||
|
│ DOI Status (DB): findable
|
||||||
|
│ DOI State (DataCite): findable
|
||||||
|
│ Dataset Modified: 2024-09-15T10:30:00.000Z
|
||||||
|
│ DOI Modified: 2024-09-10T08:15:00.000Z
|
||||||
|
│ Needs Update: YES - Dataset newer than DOI
|
||||||
|
└───────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
┌─ Dataset 267 ─────────────────────────────────────────────────────────
|
||||||
|
│ DOI Value: 10.21388/tethys.267
|
||||||
|
│ DOI Status (DB): findable
|
||||||
|
│ DOI State (DataCite): findable
|
||||||
|
│ Dataset Modified: 2024-09-18T14:20:00.000Z
|
||||||
|
│ DOI Modified: 2024-09-16T12:45:00.000Z
|
||||||
|
│ Needs Update: YES - Dataset newer than DOI
|
||||||
|
└───────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
DataCite Stats Summary: 2 datasets need updating, 48 are up to date
|
||||||
|
```
|
||||||
|
|
||||||
|
## Update Logic
|
||||||
|
|
||||||
|
The command uses intelligent update detection:
|
||||||
|
|
||||||
|
1. **Compares modification dates**: Dataset `server_date_modified` vs DOI last modification date from DataCite
|
||||||
|
2. **Validates data integrity**: Checks for missing or future dates
|
||||||
|
3. **Handles API failures gracefully**: Updates anyway if DataCite info can't be retrieved
|
||||||
|
4. **Uses dual API approach**: DataCite REST API (primary) with MDS API fallback
|
||||||
|
|
||||||
|
### When Updates Happen
|
||||||
|
|
||||||
|
| Condition | Action | Reason |
|
||||||
|
|-----------|--------|--------|
|
||||||
|
| Dataset modified > DOI modified | ✅ Update | Dataset has newer changes |
|
||||||
|
| Dataset modified ≤ DOI modified | ❌ Skip | DOI is up to date |
|
||||||
|
| Dataset date in future | ❌ Skip | Invalid data, needs investigation |
|
||||||
|
| Dataset date missing | ✅ Update | Can't determine staleness |
|
||||||
|
| DataCite API error | ✅ Update | Better safe than sorry |
|
||||||
|
| `--force` flag used | ✅ Update | Override all logic |
|
||||||
|
|
||||||
|
## Environment Configuration
|
||||||
|
|
||||||
|
Required environment variables:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# DataCite Credentials
|
||||||
|
DATACITE_USERNAME=your_username
|
||||||
|
DATACITE_PASSWORD=your_password
|
||||||
|
|
||||||
|
# API Endpoints (environment-specific)
|
||||||
|
DATACITE_API_URL=https://api.test.datacite.org # Test environment
|
||||||
|
DATACITE_SERVICE_URL=https://mds.test.datacite.org # Test MDS
|
||||||
|
|
||||||
|
# DATACITE_API_URL=https://api.datacite.org # Production (use instead of the test URL above)
|
||||||
|
# DATACITE_SERVICE_URL=https://mds.datacite.org # Production MDS (use instead of the test URL above)
|
||||||
|
|
||||||
|
# Project Configuration
|
||||||
|
DATACITE_PREFIX=10.21388 # Your DOI prefix
|
||||||
|
BASE_DOMAIN=tethys.at # Your domain
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
The command handles various error scenarios:
|
||||||
|
|
||||||
|
- **Invalid modification dates**: Logs errors but continues processing other datasets
|
||||||
|
- **DataCite API failures**: Falls back to the MDS API; if the DOI information still cannot be retrieved, the record is updated anyway
|
||||||
|
- **Missing DOI identifiers**: Skips datasets without DOI identifiers
|
||||||
|
- **Network issues**: Continues with next dataset after logging error
|
||||||
|
|
||||||
|
## Integration
|
||||||
|
|
||||||
|
The command integrates with:
|
||||||
|
|
||||||
|
- **Dataset Model**: Uses `server_date_modified` for change detection
|
||||||
|
- **DatasetIdentifier Model**: Reads DOI values and status
|
||||||
|
- **OpenSearch Index**: Updates search index after DataCite update
|
||||||
|
- **DoiClient**: Handles all DataCite API interactions
|
||||||
|
|
||||||
|
## Common Workflows
|
||||||
|
|
||||||
|
### Daily Maintenance
|
||||||
|
```bash
|
||||||
|
# Update any datasets modified since their DOI was last updated
|
||||||
|
node ace update:datacite
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pre-Deployment Check
|
||||||
|
```bash
|
||||||
|
# Check what would be updated before deployment
|
||||||
|
node ace update:datacite --dry-run
|
||||||
|
```
|
||||||
|
|
||||||
|
### Debugging Sync Issues
|
||||||
|
```bash
|
||||||
|
# Investigate why specific dataset isn't syncing
|
||||||
|
node ace update:datacite --stats --publish_id 231
|
||||||
|
```
|
||||||
|
|
||||||
|
### Full Resync
|
||||||
|
```bash
|
||||||
|
# Force update all DOI records (use with caution)
|
||||||
|
node ace update:datacite --force
|
||||||
|
```
|
||||||
|
|
||||||
|
### Monitoring Report
|
||||||
|
```bash
|
||||||
|
# Generate sync status report
|
||||||
|
node ace update:datacite --stats > datacite-sync-report.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
1. **Regular Updates**: Run daily or after bulk dataset modifications
|
||||||
|
2. **Test First**: Use `--dry-run` or `--stats` before bulk operations
|
||||||
|
3. **Monitor Logs**: Check for data integrity warnings
|
||||||
|
4. **Environment Separation**: Use correct API URLs for test vs production
|
||||||
|
5. **Rate Limiting**: The command handles DataCite rate limits automatically
|
||||||
989
package-lock.json
generated
989
package-lock.json
generated
File diff suppressed because it is too large
Load diff
174
readme.md
174
readme.md
|
|
@ -11,6 +11,8 @@ Welcome to the Tethys Research Repository Backend System! This is the backend co
|
||||||
- [Configuration](#configuration)
|
- [Configuration](#configuration)
|
||||||
- [Database](#database)
|
- [Database](#database)
|
||||||
- [API Documentation](#api-documentation)
|
- [API Documentation](#api-documentation)
|
||||||
|
- [Commands](#commands)
|
||||||
|
- [Documentation](#documentation)
|
||||||
- [Contributing](#contributing)
|
- [Contributing](#contributing)
|
||||||
- [License](#license)
|
- [License](#license)
|
||||||
|
|
||||||
|
|
@ -29,5 +31,175 @@ Before you begin, ensure you have met the following requirements:
|
||||||
1. Clone this repository:
|
1. Clone this repository:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://gitea.geologie.ac.at/geolba/tethys.backend.git
|
git clone https://gitea.geologie.ac.at/geolba/tethys.backend.git
|
||||||
|
cd tethys.backend
|
||||||
```
|
```
|
||||||
|
|
||||||
|
2. Install dependencies:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm install
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Configure environment variables (see [Configuration](#configuration))
|
||||||
|
|
||||||
|
4. Run database migrations:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
node ace migration:run
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Start the development server:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm run dev
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
The Tethys Backend provides RESTful APIs for managing research datasets, user authentication, DOI registration, and search functionality.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
Copy the `.env.example` file to `.env` and configure the following variables:
|
||||||
|
|
||||||
|
### Database Configuration
|
||||||
|
```bash
|
||||||
|
DB_CONNECTION=pg
|
||||||
|
DB_HOST=localhost
|
||||||
|
DB_PORT=5432
|
||||||
|
DB_USER=your_username
|
||||||
|
DB_PASSWORD=your_password
|
||||||
|
DB_DATABASE=tethys_db
|
||||||
|
```
|
||||||
|
|
||||||
|
### DataCite Configuration
|
||||||
|
```bash
|
||||||
|
# DataCite Credentials
|
||||||
|
DATACITE_USERNAME=your_datacite_username
|
||||||
|
DATACITE_PASSWORD=your_datacite_password
|
||||||
|
DATACITE_PREFIX=10.21388
|
||||||
|
|
||||||
|
# Environment-specific API endpoints
|
||||||
|
DATACITE_API_URL=https://api.test.datacite.org # Test environment
|
||||||
|
DATACITE_SERVICE_URL=https://mds.test.datacite.org # Test MDS
|
||||||
|
|
||||||
|
# For production:
|
||||||
|
# DATACITE_API_URL=https://api.datacite.org
|
||||||
|
# DATACITE_SERVICE_URL=https://mds.datacite.org
|
||||||
|
```
|
||||||
|
|
||||||
|
### OpenSearch Configuration
|
||||||
|
```bash
|
||||||
|
OPENSEARCH_HOST=localhost:9200
|
||||||
|
```
|
||||||
|
|
||||||
|
### Application Configuration
|
||||||
|
```bash
|
||||||
|
BASE_DOMAIN=tethys.at
|
||||||
|
APP_KEY=your_app_key
|
||||||
|
```
|
||||||
|
|
||||||
|
## Database
|
||||||
|
|
||||||
|
The system uses PostgreSQL with Lucid ORM. Key models include:
|
||||||
|
|
||||||
|
- **Dataset**: Research dataset metadata
|
||||||
|
- **DatasetIdentifier**: DOI and other identifiers for datasets
|
||||||
|
- **User**: User management and authentication
|
||||||
|
- **XmlCache**: Cached XML metadata
|
||||||
|
|
||||||
|
Run migrations and seeders:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run migrations
|
||||||
|
node ace migration:run
|
||||||
|
|
||||||
|
# Run seeders (if available)
|
||||||
|
node ace db:seed
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Documentation
|
||||||
|
|
||||||
|
API endpoints are available for:
|
||||||
|
|
||||||
|
- Dataset management (`/api/datasets`)
|
||||||
|
- User authentication (`/api/auth`)
|
||||||
|
- DOI registration (`/api/doi`)
|
||||||
|
- Search functionality (`/api/search`)
|
||||||
|
|
||||||
|
*Detailed API documentation can be found in the `/docs/api` directory.*
|
||||||
|
|
||||||
|
## Commands
|
||||||
|
|
||||||
|
The system includes several Ace commands for maintenance and data management:
|
||||||
|
|
||||||
|
### Dataset Indexing
|
||||||
|
```bash
|
||||||
|
# Index all published datasets to OpenSearch
|
||||||
|
node ace index:datasets
|
||||||
|
|
||||||
|
# Index a specific dataset
|
||||||
|
node ace index:datasets --publish_id 123
|
||||||
|
```
|
||||||
|
|
||||||
|
### DataCite DOI Management
|
||||||
|
```bash
|
||||||
|
# Update DataCite records for modified datasets
|
||||||
|
node ace update:datacite
|
||||||
|
|
||||||
|
# Show detailed statistics for datasets needing updates
|
||||||
|
node ace update:datacite --stats
|
||||||
|
|
||||||
|
# Preview what would be updated (dry run)
|
||||||
|
node ace update:datacite --dry-run
|
||||||
|
|
||||||
|
# Force update all DOI records
|
||||||
|
node ace update:datacite --force
|
||||||
|
|
||||||
|
# Update a specific dataset
|
||||||
|
node ace update:datacite --publish_id 123
|
||||||
|
```
|
||||||
|
|
||||||
|
*For detailed command documentation, see the [Commands Documentation](docs/commands/)*
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
Comprehensive documentation is available in the `/docs` directory:
|
||||||
|
|
||||||
|
- **[Commands Documentation](docs/commands/)** - Detailed guides for Ace commands
|
||||||
|
- [DataCite Update Command](docs/commands/update-datacite.md) - DOI synchronization and management
|
||||||
|
- [Dataset Indexing Command](docs/commands/index-datasets.md) - Search index management
|
||||||
|
- **[API Documentation](docs/api/)** - REST API endpoints and usage
|
||||||
|
- **[Deployment Guide](docs/deployment/)** - Production deployment instructions
|
||||||
|
- **[Configuration Guide](docs/configuration/)** - Environment setup and configuration options
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
1. Fork the repository
|
||||||
|
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
|
||||||
|
3. Commit your changes (`git commit -m 'Add some amazing feature'`)
|
||||||
|
4. Push to the branch (`git push origin feature/amazing-feature`)
|
||||||
|
5. Open a Pull Request
|
||||||
|
|
||||||
|
### Development Guidelines
|
||||||
|
|
||||||
|
- Follow the existing code style and conventions
|
||||||
|
- Write tests for new features
|
||||||
|
- Update documentation for any API changes
|
||||||
|
- Ensure all commands and migrations work properly
|
||||||
|
|
||||||
|
### Testing Commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run tests
|
||||||
|
npm test
|
||||||
|
|
||||||
|
# Test specific commands
|
||||||
|
node ace update:datacite --dry-run --publish_id 123
|
||||||
|
node ace index:datasets --publish_id 123
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
This project is licensed under the [MIT License](LICENSE).
|
||||||
|
|
@ -8,14 +8,24 @@ import AvatarController from '#controllers/Http/Api/AvatarController';
|
||||||
import UserController from '#controllers/Http/Api/UserController';
|
import UserController from '#controllers/Http/Api/UserController';
|
||||||
import CollectionsController from '#controllers/Http/Api/collections_controller';
|
import CollectionsController from '#controllers/Http/Api/collections_controller';
|
||||||
import { middleware } from '../kernel.js';
|
import { middleware } from '../kernel.js';
|
||||||
// API
|
|
||||||
|
// Clean DOI URL routes (no /api prefix)
|
||||||
|
|
||||||
|
// API routes with /api prefix
|
||||||
router
|
router
|
||||||
.group(() => {
|
.group(() => {
|
||||||
router.get('clients', [UserController, 'getSubmitters']).as('client.index').use(middleware.auth());;
|
router.get('clients', [UserController, 'getSubmitters']).as('client.index').use(middleware.auth());
|
||||||
router.get('authors', [AuthorsController, 'index']).as('author.index').use(middleware.auth());;
|
router.get('authors', [AuthorsController, 'index']).as('author.index').use(middleware.auth());
|
||||||
router.get('datasets', [DatasetController, 'index']).as('dataset.index');
|
router.get('datasets', [DatasetController, 'index']).as('dataset.index');
|
||||||
router.get('persons', [AuthorsController, 'persons']).as('author.persons');
|
router.get('persons', [AuthorsController, 'persons']).as('author.persons');
|
||||||
|
|
||||||
|
// This should come BEFORE any other routes that might conflict
|
||||||
|
router
|
||||||
|
.get('/dataset/:prefix/:value', [DatasetController, 'findByIdentifier'])
|
||||||
|
.where('prefix', /^10\.\d+$/) // Match DOI prefix pattern (10.xxxx)
|
||||||
|
.where('value', /^[a-zA-Z0-9._-]+\.[0-9]+(?:\.[0-9]+)*$/) // Match DOI suffix pattern
|
||||||
|
.as('dataset.findByIdentifier');
|
||||||
|
|
||||||
router.get('/dataset', [DatasetController, 'findAll']).as('dataset.findAll');
|
router.get('/dataset', [DatasetController, 'findAll']).as('dataset.findAll');
|
||||||
router.get('/dataset/:publish_id', [DatasetController, 'findOne']).as('dataset.findOne');
|
router.get('/dataset/:publish_id', [DatasetController, 'findOne']).as('dataset.findOne');
|
||||||
router.get('/sitelinks/:year', [HomeController, 'findDocumentsPerYear']);
|
router.get('/sitelinks/:year', [HomeController, 'findDocumentsPerYear']);
|
||||||
|
|
@ -35,7 +45,7 @@ router
|
||||||
.as('apps.twofactor_backupcodes.create')
|
.as('apps.twofactor_backupcodes.create')
|
||||||
.use(middleware.auth());
|
.use(middleware.auth());
|
||||||
|
|
||||||
router.get('collections/:id', [CollectionsController, 'show']).as('collection.show')
|
router.get('collections/:id', [CollectionsController, 'show']).as('collection.show');
|
||||||
})
|
})
|
||||||
// .namespace('App/Controllers/Http/Api')
|
// .namespace('App/Controllers/Http/Api')
|
||||||
.prefix('api');
|
.prefix('api');
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
/*
|
/*
|
||||||
|--------------------------------------------------------------------------
|
|--------------------------------------------------------------------------
|
||||||
| Preloaded File - node ace make:preload rules/orcid
|
| Preloaded File - node ace make:preload rules/orcid
|
||||||
| ❯ Do you want to register the preload file in .adonisrc.ts file? (y/N) · true
|
| Do you want to register the preload file in .adonisrc.ts file? (y/N) · true
|
||||||
| DONE: create start/rules/orcid.ts
|
| DONE: create start/rules/orcid.ts
|
||||||
| DONE: update adonisrc.ts file
|
| DONE: update adonisrc.ts file
|
||||||
|--------------------------------------------------------------------------
|
|--------------------------------------------------------------------------
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue