- feat: Enhance README with setup instructions, usage, and command documentation

- fix: Update API routes to include DOI URL handling and improve route organization

- chore: Add ORCID preload rule file and ensure proper registration

- docs: Add MIT License to the project for open-source compliance

- feat: Implement command to detect and fix missing dataset cross-references

- feat: Create command for updating DataCite DOI records with detailed logging and error handling

- docs: Add comprehensive documentation for dataset indexing command

- docs: Create detailed documentation for DataCite update command with usage examples and error handling
commit c049b22723
Kaimbacher, 2025-09-19 14:35:23 +02:00
11 changed files with 2187 additions and 555 deletions

LICENSE Normal file
@@ -0,0 +1,22 @@
MIT License
Copyright (c) 2025 Tethys Research Repository
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

app/Controllers/Http/Api/DatasetController.ts
@@ -1,23 +1,35 @@
import type { HttpContext } from '@adonisjs/core/http';
import Dataset from '#models/dataset';
import { StatusCodes } from 'http-status-codes';

// node ace make:controller Author
export default class DatasetController {
    /**
     * GET /api/datasets
     * Find all published datasets
     */
    public async index({ response }: HttpContext) {
        try {
            const datasets = await Dataset.query()
                .where(function (query) {
                    query.where('server_state', 'published').orWhere('server_state', 'deleted');
                })
                .preload('titles')
                .preload('identifier')
                .orderBy('server_date_published', 'desc');
            return response.status(StatusCodes.OK).json(datasets);
        } catch (error) {
            return response.status(StatusCodes.INTERNAL_SERVER_ERROR).json({
                message: error.message || 'Some error occurred while retrieving datasets.',
            });
        }
    }

    /**
     * GET /api/dataset
     * Find all published datasets
     */
    public async findAll({ response }: HttpContext) {
        try {
            const datasets = await Dataset.query()
@@ -33,48 +45,142 @@ export default class DatasetController
        }
    }

    /**
     * GET /api/dataset/:publish_id
     * Find one dataset by publish_id
     */
    public async findOne({ response, params }: HttpContext) {
        try {
            const dataset = await Dataset.query()
                .where('publish_id', params.publish_id)
                .preload('titles')
                .preload('descriptions') // Using 'descriptions' instead of 'abstracts'
                .preload('user', (builder) => {
                    builder.select(['id', 'firstName', 'lastName', 'avatar', 'login']);
                })
                .preload('authors', (builder) => {
                    builder
                        .select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
                        .withCount('datasets', (query) => {
                            query.as('datasets_count');
                        })
                        .pivotColumns(['role', 'sort_order'])
                        .orderBy('pivot_sort_order', 'asc');
                })
                .preload('contributors', (builder) => {
                    builder
                        .select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
                        .withCount('datasets', (query) => {
                            query.as('datasets_count');
                        })
                        .pivotColumns(['role', 'sort_order', 'contributor_type'])
                        .orderBy('pivot_sort_order', 'asc');
                })
                .preload('subjects')
                .preload('coverage')
                .preload('licenses')
                .preload('references')
                .preload('project')
                .preload('referenced_by', (builder) => {
                    builder.preload('dataset', (builder) => {
                        builder.preload('identifier');
                    });
                })
                .preload('files', (builder) => {
                    builder.preload('hashvalues');
                })
                .preload('identifier')
                .first(); // Use first() instead of firstOrFail() to handle not found gracefully

            if (!dataset) {
                return response.status(StatusCodes.NOT_FOUND).json({
                    message: `Cannot find Dataset with publish_id=${params.publish_id}.`,
                });
            }

            return response.status(StatusCodes.OK).json(dataset);
        } catch (error) {
            return response.status(StatusCodes.INTERNAL_SERVER_ERROR).json({
                message: error.message || `Error retrieving Dataset with publish_id=${params.publish_id}.`,
            });
        }
    }

    /**
     * GET /:prefix/:value
     * Find dataset by identifier (e.g., https://doi.tethys.at/10.24341/tethys.99.2)
     */
    public async findByIdentifier({ response, params }: HttpContext) {
        const identifierValue = `${params.prefix}/${params.value}`;

        // Optional: Validate DOI format
        if (!identifierValue.match(/^10\.\d+\/[a-zA-Z0-9._-]+\.[0-9]+(?:\.[0-9]+)*$/)) {
            return response.status(StatusCodes.BAD_REQUEST).json({
                message: `Invalid DOI format: ${identifierValue}`,
            });
        }

        try {
            const dataset = await Dataset.query()
                // Alternative: subquery with whereIn
                // .whereIn('id', (subQuery) => {
                //     subQuery.select('dataset_id').from('dataset_identifiers').where('value', identifierValue);
                // })
                .whereHas('identifier', (builder) => {
                    builder.where('value', identifierValue);
                })
                .preload('titles')
                .preload('descriptions') // Using 'descriptions' instead of 'abstracts'
                .preload('user', (builder) => {
                    builder.select(['id', 'firstName', 'lastName', 'avatar', 'login']);
                })
                .preload('authors', (builder) => {
                    builder
                        .select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
                        .withCount('datasets', (query) => {
                            query.as('datasets_count');
                        })
                        .pivotColumns(['role', 'sort_order'])
                        .wherePivot('role', 'author')
                        .orderBy('pivot_sort_order', 'asc');
                })
                .preload('contributors', (builder) => {
                    builder
                        .select(['id', 'academic_title', 'first_name', 'last_name', 'identifier_orcid', 'status', 'name_type'])
                        .withCount('datasets', (query) => {
                            query.as('datasets_count');
                        })
                        .pivotColumns(['role', 'sort_order', 'contributor_type'])
                        .wherePivot('role', 'contributor')
                        .orderBy('pivot_sort_order', 'asc');
                })
                .preload('subjects')
                .preload('coverage')
                .preload('licenses')
                .preload('references')
                .preload('project')
                .preload('referenced_by', (builder) => {
                    builder.preload('dataset', (builder) => {
                        builder.preload('identifier');
                    });
                })
                .preload('files', (builder) => {
                    builder.preload('hashvalues');
                })
                .preload('identifier')
                .first();

            if (!dataset) {
                return response.status(StatusCodes.NOT_FOUND).json({
                    message: `Cannot find Dataset with identifier=${identifierValue}.`,
                });
            }

            return response.status(StatusCodes.OK).json(dataset);
        } catch (error) {
            return response.status(StatusCodes.INTERNAL_SERVER_ERROR).json({
                message: error.message || `Error retrieving Dataset with identifier=${identifierValue}.`,
            });
        }
    }
}

app/Library/Doi/DoiClient.ts
@@ -1,6 +1,3 @@
import DoiClientContract from '#app/Library/Doi/DoiClientContract';
import DoiClientException from '#app/exceptions/DoiClientException';
import { StatusCodes } from 'http-status-codes';
@@ -12,14 +9,14 @@ export class DoiClient implements DoiClientContract
    public username: string;
    public password: string;
    public serviceUrl: string;
    public apiUrl: string;

    constructor() {
        // const datacite_environment = process.env.DATACITE_ENVIRONMENT || 'debug';
        this.username = process.env.DATACITE_USERNAME || '';
        this.password = process.env.DATACITE_PASSWORD || '';
        this.serviceUrl = process.env.DATACITE_SERVICE_URL || '';
        this.apiUrl = process.env.DATACITE_API_URL || 'https://api.datacite.org';

        if (this.username === '' || this.password === '' || this.serviceUrl === '') {
            const message = 'Missing configuration settings to properly initialize DOI client';
@@ -90,4 +87,240 @@
            throw new DoiClientException(error.response.status, error.response.data);
        }
    }

    /**
     * Retrieves DOI information from the DataCite REST API
     *
     * @param doiValue The DOI identifier e.g. '10.5072/tethys.999'
     * @returns Promise with DOI information or null if not found
     */
    public async getDoiInfo(doiValue: string): Promise<any | null> {
        try {
            // Use configurable DataCite REST API URL
            const dataciteApiUrl = `${this.apiUrl}/dois/${doiValue}`;
            const response = await axios.get(dataciteApiUrl, {
                headers: {
                    Accept: 'application/vnd.api+json',
                },
            });
            if (response.status === 200 && response.data.data) {
                return {
                    created: response.data.data.attributes.created,
                    registered: response.data.data.attributes.registered,
                    updated: response.data.data.attributes.updated,
                    published: response.data.data.attributes.published,
                    state: response.data.data.attributes.state,
                    url: response.data.data.attributes.url,
                    metadata: response.data.data.attributes,
                };
            }
        } catch (error) {
            if (error.response?.status === 404) {
                logger.debug(`DOI ${doiValue} not found in DataCite`);
                return null;
            }
            logger.debug(`DataCite REST API failed for ${doiValue}: ${error.message}`);
            // Fall back to the MDS API
            return await this.getDoiInfoFromMds(doiValue);
        }
        return null;
    }

    /**
     * Fallback method to get DOI info from the MDS API
     *
     * @param doiValue The DOI identifier
     * @returns Promise with basic DOI information or null
     */
    private async getDoiInfoFromMds(doiValue: string): Promise<any | null> {
        try {
            const auth = {
                username: this.username,
                password: this.password,
            };
            // Get DOI URL
            const doiResponse = await axios.get(`${this.serviceUrl}/doi/${doiValue}`, { auth });
            if (doiResponse.status === 200) {
                // Get metadata if available
                try {
                    const metadataResponse = await axios.get(`${this.serviceUrl}/metadata/${doiValue}`, {
                        auth,
                        headers: {
                            Accept: 'application/xml',
                        },
                    });
                    return {
                        url: doiResponse.data.trim(),
                        metadata: metadataResponse.data,
                        created: new Date().toISOString(), // MDS doesn't provide creation dates
                        registered: new Date().toISOString(), // Use current time as fallback
                        source: 'mds',
                    };
                } catch (metadataError) {
                    // Return basic info even if the metadata fetch fails
                    return {
                        url: doiResponse.data.trim(),
                        created: new Date().toISOString(),
                        registered: new Date().toISOString(),
                        source: 'mds',
                    };
                }
            }
        } catch (error) {
            if (error.response?.status === 404) {
                logger.debug(`DOI ${doiValue} not found in DataCite MDS`);
                return null;
            }
            logger.debug(`DataCite MDS API failed for ${doiValue}: ${error.message}`);
        }
        return null;
    }

    /**
     * Checks if a DOI exists in DataCite
     *
     * @param doiValue The DOI identifier
     * @returns Promise<boolean> True if the DOI exists
     */
    public async doiExists(doiValue: string): Promise<boolean> {
        const doiInfo = await this.getDoiInfo(doiValue);
        return doiInfo !== null;
    }

    /**
     * Gets the last modification date of a DOI
     *
     * @param doiValue The DOI identifier
     * @returns Promise<Date | null> Last modification date, or the registration/creation date if never updated; null if not found
     */
    public async getDoiLastModified(doiValue: string): Promise<Date | null> {
        const doiInfo = await this.getDoiInfo(doiValue);
        if (doiInfo) {
            // Use the updated date if available, otherwise fall back to the created/registered date
            const dateToUse = doiInfo.updated || doiInfo.registered || doiInfo.created;
            if (dateToUse) {
                logger.debug(
                    `DOI ${doiValue}: Using ${doiInfo.updated ? 'updated' : doiInfo.registered ? 'registered' : 'created'} date: ${dateToUse}`,
                );
                return new Date(dateToUse);
            }
        }
        return null;
    }

    /**
     * Makes a DOI unfindable (registered but not discoverable)
     * Note: DOIs cannot be deleted, only made unfindable
     * await doiClient.makeDoiUnfindable('10.21388/tethys.231');
     *
     * @param doiValue The DOI identifier e.g. '10.5072/tethys.999'
     * @returns Promise<AxiosResponse<any>> The http response
     */
    public async makeDoiUnfindable(doiValue: string): Promise<AxiosResponse<any>> {
        const auth = {
            username: this.username,
            password: this.password,
        };
        try {
            // First, check if the DOI exists
            const exists = await this.doiExists(doiValue);
            if (!exists) {
                throw new DoiClientException(404, `DOI ${doiValue} not found`);
            }
            // Delete the DOI URL mapping to make it unfindable
            // This removes the URL but keeps the metadata registered
            const response = await axios.delete(`${this.serviceUrl}/doi/${doiValue}`, { auth });
            // Response codes for DELETE /doi/{doi}
            // 200 OK: operation successful
            // 401 Unauthorized: no login
            // 403 Forbidden: login problem, quota exceeded
            // 404 Not Found: DOI does not exist
            if (response.status !== 200) {
                const message = `Unexpected DataCite MDS response code ${response.status}`;
                logger.error(message);
                throw new DoiClientException(response.status, message);
            }
            logger.info(`DOI ${doiValue} successfully made unfindable`);
            return response;
        } catch (error) {
            logger.error(`Failed to make DOI ${doiValue} unfindable: ${error.message}`);
            if (error instanceof DoiClientException) {
                throw error;
            }
            throw new DoiClientException(error.response?.status || 500, error.response?.data || error.message);
        }
    }

    /**
     * Makes a DOI findable again by re-registering the URL
     * await doiClient.makeDoiFindable(
     *     '10.21388/tethys.231',
     *     'https://doi.dev.tethys.at/10.21388/tethys.231'
     * );
     *
     * @param doiValue The DOI identifier e.g. '10.5072/tethys.999'
     * @param landingPageUrl The landing page URL
     * @returns Promise<AxiosResponse<any>> The http response
     */
    public async makeDoiFindable(doiValue: string, landingPageUrl: string): Promise<AxiosResponse<any>> {
        const auth = {
            username: this.username,
            password: this.password,
        };
        try {
            // Re-register the DOI with its URL to make it findable again
            const response = await axios.put(`${this.serviceUrl}/doi/${doiValue}`, `doi=${doiValue}\nurl=${landingPageUrl}`, { auth });
            // Response codes for PUT /doi/{doi}
            // 201 Created: operation successful
            // 400 Bad Request: request body must be exactly two lines: DOI and URL
            // 401 Unauthorized: no login
            // 403 Forbidden: login problem, quota exceeded
            // 412 Precondition failed: metadata must be uploaded first
            if (response.status !== 201) {
                const message = `Unexpected DataCite MDS response code ${response.status}`;
                logger.error(message);
                throw new DoiClientException(response.status, message);
            }
            logger.info(`DOI ${doiValue} successfully made findable again`);
            return response;
        } catch (error) {
            logger.error(`Failed to make DOI ${doiValue} findable: ${error.message}`);
            if (error instanceof DoiClientException) {
                throw error;
            }
            throw new DoiClientException(error.response?.status || 500, error.response?.data || error.message);
        }
    }

    /**
     * Gets the current state of a DOI (draft, registered, findable)
     * const state = await doiClient.getDoiState('10.21388/tethys.231');
     * console.log(`Current state: ${state}`); // 'findable'
     *
     * @param doiValue The DOI identifier
     * @returns Promise<string | null> The DOI state or null if not found
     */
    public async getDoiState(doiValue: string): Promise<string | null> {
        const doiInfo = await this.getDoiInfo(doiValue);
        return doiInfo?.state || null;
    }
}

commands/fix_dataset_cross_references.ts Normal file
@@ -0,0 +1,317 @@
/*
|--------------------------------------------------------------------------
| node ace make:command fix-dataset-cross-references
| DONE: create commands/fix_dataset_cross_references.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import type { CommandOptions } from '@adonisjs/core/types/ace';
import Dataset from '#models/dataset';
import DatasetReference from '#models/dataset_reference';
// import env from '#start/env';
interface MissingCrossReference {
sourceDatasetId: number;
targetDatasetId: number;
sourcePublishId: number | null;
targetPublishId: number | null;
referenceType: string;
relation: string;
doi: string | null;
reverseRelation: string;
}
export default class DetectMissingCrossReferences extends BaseCommand {
static commandName = 'detect:missing-cross-references';
static description = 'Detect missing bidirectional cross-references between versioned datasets';
public static needsApplication = true;
@flags.boolean({ alias: 'f', description: 'Fix missing cross-references automatically' })
public fix: boolean = false;
@flags.boolean({ alias: 'v', description: 'Verbose output' })
public verbose: boolean = false;
public static options: CommandOptions = {
startApp: true,
staysAlive: false,
};
async run() {
this.logger.info('🔍 Detecting missing cross-references...');
try {
const missingReferences = await this.findMissingCrossReferences();
if (missingReferences.length === 0) {
this.logger.success('All cross-references are properly linked!');
return;
}
this.logger.warning(`Found ${missingReferences.length} missing cross-reference(s):`);
for (const missing of missingReferences) {
this.logger.info(
`Dataset ${missing.sourceDatasetId} references ${missing.targetDatasetId}, but reverse reference is missing`,
);
if (this.verbose) {
this.logger.info(` - Reference type: ${missing.referenceType}`);
this.logger.info(` - Relation: ${missing.relation}`);
this.logger.info(` - DOI: ${missing.doi}`);
}
}
if (this.fix) {
await this.fixMissingReferences(missingReferences);
this.logger.success('All missing cross-references have been fixed!');
} else {
this.printMissingReferencesList(missingReferences);
this.logger.info('💡 Run with --fix flag to automatically create missing cross-references');
}
} catch (error) {
this.logger.error('Error detecting missing cross-references:', error);
process.exit(1);
}
}
private async findMissingCrossReferences(): Promise<MissingCrossReference[]> {
const missingReferences: MissingCrossReference[] = [];
this.logger.info('📊 Querying dataset references...');
// Find all references that point to Tethys datasets (DOI or URL containing tethys DOI)
// Only from datasets that are published
const tethysReferences = await DatasetReference.query()
.whereIn('type', ['DOI', 'URL'])
.where((query) => {
query.where('value', 'like', '%doi.org/10.24341/tethys.%').orWhere('value', 'like', '%tethys.at/dataset/%');
})
.preload('dataset', (datasetQuery) => {
datasetQuery.where('server_state', 'published');
})
.whereHas('dataset', (datasetQuery) => {
datasetQuery.where('server_state', 'published');
});
this.logger.info(`🔗 Found ${tethysReferences.length} Tethys references from published datasets`);
let processedCount = 0;
for (const reference of tethysReferences) {
processedCount++;
if (this.verbose && processedCount % 10 === 0) {
this.logger.info(`📈 Processed ${processedCount}/${tethysReferences.length} references...`);
}
// Extract dataset publish_id from DOI or URL
const targetDatasetPublish = this.extractDatasetPublishIdFromReference(reference.value);
if (!targetDatasetPublish) {
if (this.verbose) {
this.logger.warning(`⚠️ Could not extract publish ID from: ${reference.value}`);
}
continue;
}
// Check if target dataset exists and is published
const targetDataset = await Dataset.query()
.where('publish_id', targetDatasetPublish)
.where('server_state', 'published')
.first();
if (!targetDataset) {
if (this.verbose) {
this.logger.warning(`⚠️ Target dataset with publish_id ${targetDatasetPublish} not found or not published`);
}
continue;
}
// Ensure we have a valid source dataset with proper preloading
if (!reference.dataset) {
this.logger.warning(`⚠️ Source dataset ${reference.document_id} not properly loaded, skipping...`);
continue;
}
// Check if reverse reference exists
const reverseReferenceExists = await this.checkReverseReferenceExists(
targetDataset.id,
reference.document_id,
reference.relation,
);
if (!reverseReferenceExists) {
missingReferences.push({
sourceDatasetId: reference.document_id,
targetDatasetId: targetDataset.id,
sourcePublishId: reference.dataset.publish_id || null,
targetPublishId: targetDataset.publish_id || null,
referenceType: reference.type,
relation: reference.relation,
doi: reference.value,
reverseRelation: this.getReverseRelation(reference.relation),
});
}
}
this.logger.info(`✅ Processed all ${processedCount} references`);
return missingReferences;
}
private extractDatasetPublishIdFromReference(value: string): number | null {
// Extract from DOI: https://doi.org/10.24341/tethys.107 -> 107
const doiMatch = value.match(/10\.24341\/tethys\.(\d+)/);
if (doiMatch) {
return parseInt(doiMatch[1]);
}
// Extract from URL: https://tethys.at/dataset/107 -> 107
const urlMatch = value.match(/tethys\.at\/dataset\/(\d+)/);
if (urlMatch) {
return parseInt(urlMatch[1]);
}
return null;
}
private async checkReverseReferenceExists(
sourceDatasetId: number,
targetDatasetId: number,
originalRelation: string,
): Promise<boolean> {
const reverseRelation = this.getReverseRelation(originalRelation);
// Only check for reverse references where the source dataset is also published
const reverseReference = await DatasetReference.query()
.where('document_id', sourceDatasetId)
.where('related_document_id', targetDatasetId)
.where('relation', reverseRelation)
.whereHas('dataset', (datasetQuery) => {
datasetQuery.where('server_state', 'published');
})
.first();
return !!reverseReference;
}
private getReverseRelation(relation: string): string {
const relationMap: Record<string, string> = {
IsNewVersionOf: 'IsPreviousVersionOf',
IsPreviousVersionOf: 'IsNewVersionOf',
IsVersionOf: 'HasVersion',
HasVersion: 'IsVersionOf',
Compiles: 'IsCompiledBy',
IsCompiledBy: 'Compiles',
IsVariantFormOf: 'IsOriginalFormOf',
IsOriginalFormOf: 'IsVariantFormOf',
IsPartOf: 'HasPart',
HasPart: 'IsPartOf',
IsSupplementTo: 'IsSupplementedBy',
IsSupplementedBy: 'IsSupplementTo',
Continues: 'IsContinuedBy',
IsContinuedBy: 'Continues',
};
// Fall back to 'HasVersion' for relation types that are not covered by the map
return relationMap[relation] || 'HasVersion';
}
private printMissingReferencesList(missingReferences: MissingCrossReference[]) {
console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
console.log('│ MISSING CROSS-REFERENCES REPORT │');
console.log('│ (Published Datasets Only) │');
console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
console.log();
missingReferences.forEach((missing, index) => {
console.log(
`${index + 1}. Dataset ${missing.sourceDatasetId} (Publish ID: ${missing.sourcePublishId}) → Dataset ${missing.targetDatasetId} (Publish ID: ${missing.targetPublishId})`,
);
console.log(` ├─ Current relation: "${missing.relation}"`);
console.log(` ├─ Missing reverse relation: "${missing.reverseRelation}"`);
console.log(` ├─ Reference type: ${missing.referenceType}`);
console.log(` └─ DOI/URL: ${missing.doi}`);
console.log();
});
console.log('┌─────────────────────────────────────────────────────────────────────────────────┐');
console.log(`│ SUMMARY: ${missingReferences.length} missing reverse reference(s) detected │`);
console.log('└─────────────────────────────────────────────────────────────────────────────────┘');
}
private async fixMissingReferences(missingReferences: MissingCrossReference[]) {
this.logger.info('🔧 Creating missing cross-references in database...');
let fixedCount = 0;
let errorCount = 0;
for (const [index, missing] of missingReferences.entries()) {
try {
// Get the source dataset to create proper reference - ensure it's published
const sourceDataset = await Dataset.query()
.where('id', missing.sourceDatasetId)
.where('server_state', 'published')
.preload('identifier')
.first();
if (!sourceDataset) {
this.logger.warning(`⚠️ Source dataset ${missing.sourceDatasetId} not found or not published, skipping...`);
errorCount++;
continue;
}
// Create the reverse reference
const reverseReference = new DatasetReference();
reverseReference.document_id = missing.targetDatasetId;
reverseReference.related_document_id = missing.sourceDatasetId;
reverseReference.type = 'DOI';
reverseReference.relation = missing.reverseRelation;
// Use the source dataset's DOI for the value
if (sourceDataset.identifier?.value) {
reverseReference.value = `https://doi.org/${sourceDataset.identifier.value}`;
} else {
// Fallback to dataset URL if no DOI
reverseReference.value = `https://tethys.at/dataset/${sourceDataset.publish_id || missing.sourceDatasetId}`;
}
// Use the source dataset's main title for the label
reverseReference.label = sourceDataset.mainTitle || `Dataset ${missing.sourceDatasetId}`;
await reverseReference.save();
fixedCount++;
if (this.verbose) {
this.logger.info(
`✅ [${index + 1}/${missingReferences.length}] Created reverse reference: Dataset ${missing.targetDatasetId} -> ${missing.sourceDatasetId}`,
);
} else if ((index + 1) % 10 === 0) {
this.logger.info(`📈 Fixed ${fixedCount}/${missingReferences.length} references...`);
}
} catch (error) {
this.logger.error(
`❌ Error creating reverse reference for datasets ${missing.targetDatasetId} -> ${missing.sourceDatasetId}:`,
error,
);
errorCount++;
}
}
this.logger.info(`📊 Fix completed: ${fixedCount} created, ${errorCount} errors`);
}
}

commands/update_datacite.ts Normal file
@@ -0,0 +1,271 @@
/*
|--------------------------------------------------------------------------
| node ace make:command update-datacite
| DONE: create commands/update_datacite.ts
|--------------------------------------------------------------------------
*/
import { BaseCommand, flags } from '@adonisjs/core/ace';
import { CommandOptions } from '@adonisjs/core/types/ace';
import Dataset from '#models/dataset';
import { DoiClient } from '#app/Library/Doi/DoiClient';
import DoiClientException from '#app/exceptions/DoiClientException';
import Index from '#app/Library/Utils/Index';
import env from '#start/env';
import logger from '@adonisjs/core/services/logger';
import { DateTime } from 'luxon';
import { getDomain } from '#app/utils/utility-functions';
export default class UpdateDatacite extends BaseCommand {
static commandName = 'update:datacite';
static description = 'Update DataCite DOI records for published datasets';
public static needsApplication = true;
@flags.number({ alias: 'p', description: 'Specific publish_id to update' })
public publish_id: number;
@flags.boolean({ alias: 'f', description: 'Force update all records regardless of modification date' })
public force: boolean = false;
@flags.boolean({ alias: 'd', description: 'Dry run - show what would be updated without making changes' })
public dryRun: boolean = false;
@flags.boolean({ alias: 's', description: 'Show detailed stats for each dataset that needs updating' })
public stats: boolean = false;
//example: node ace update:datacite -p 123 --force --dry-run
public static options: CommandOptions = {
startApp: true, // Whether to boot the application before running the command
staysAlive: false, // Whether to keep the process alive after the command has executed
};
async run() {
logger.info('Starting DataCite update process...');
const prefix = env.get('DATACITE_PREFIX', '');
const base_domain = env.get('BASE_DOMAIN', '');
const apiUrl = env.get('DATACITE_API_URL', 'https://api.datacite.org');
if (!prefix || !base_domain) {
logger.error('Missing DATACITE_PREFIX or BASE_DOMAIN environment variables');
return;
}
logger.info(`Using DataCite API: ${apiUrl}`);
const datasets = await this.getDatasets();
logger.info(`Found ${datasets.length} datasets to process`);
let updated = 0;
let skipped = 0;
let errors = 0;
for (const dataset of datasets) {
try {
const shouldUpdate = this.force || (await this.shouldUpdateDataset(dataset));
if (this.stats) {
// Stats mode: show detailed information for datasets that need updating
if (shouldUpdate) {
await this.showDatasetStats(dataset);
updated++;
} else {
skipped++;
}
continue;
}
if (!shouldUpdate) {
logger.info(`Dataset ${dataset.publish_id}: Up to date, skipping`);
skipped++;
continue;
}
if (this.dryRun) {
logger.info(`Dataset ${dataset.publish_id}: Would update DataCite record (dry run)`);
updated++;
continue;
}
await this.updateDataciteRecord(dataset, prefix, base_domain);
logger.info(`Dataset ${dataset.publish_id}: Successfully updated DataCite record`);
updated++;
} catch (error) {
logger.error(`Dataset ${dataset.publish_id}: Failed to update - ${error.message}`);
errors++;
}
}
if (this.stats) {
logger.info(`\nDataCite Stats Summary: ${updated} datasets need updating, ${skipped} are up to date`);
} else {
logger.info(`DataCite update completed. Updated: ${updated}, Skipped: ${skipped}, Errors: ${errors}`);
}
}
private async getDatasets(): Promise<Dataset[]> {
const query = Dataset.query()
.preload('identifier')
.preload('xmlCache')
.where('server_state', 'published')
.whereHas('identifier', (identifierQuery) => {
identifierQuery.where('type', 'doi');
});
if (this.publish_id) {
query.where('publish_id', this.publish_id);
}
return await query.exec();
}
private async shouldUpdateDataset(dataset: Dataset): Promise<boolean> {
try {
// Check if dataset has a DOI identifier (HasOne relationship)
let doiIdentifier = dataset.identifier;
if (!doiIdentifier) {
// Try to load the relationship if not already loaded
await dataset.load('identifier');
doiIdentifier = dataset.identifier;
}
if (!doiIdentifier || doiIdentifier.type !== 'doi') {
logger.warn(`Dataset ${dataset.publish_id}: No DOI identifier found`);
return false;
}
// Validate dataset modification date
const datasetModified = dataset.server_date_modified;
const now = DateTime.now();
if (!datasetModified) {
logger.error(`Dataset ${dataset.publish_id}: server_date_modified is null or undefined`);
return true; // Update anyway if modification date is missing
}
if (datasetModified > now) {
logger.error(
`Dataset ${dataset.publish_id}: server_date_modified (${datasetModified.toISO()}) is in the future! ` +
`Current time: ${now.toISO()}. This indicates a data integrity issue. Skipping update.`,
);
return false; // Do not update when modification date is invalid
}
// Get DOI information from DataCite using DoiClient
const doiClient = new DoiClient();
const doiLastModified = await doiClient.getDoiLastModified(doiIdentifier.value);
if (!doiLastModified) {
logger.warn(`Dataset ${dataset.publish_id}: Could not retrieve DOI modification date from DataCite`);
return true; // Update anyway if we can't get DOI info
}
// Compare dataset modification date with DOI modification date
const doiModified = DateTime.fromJSDate(doiLastModified);
logger.debug(
`Dataset ${dataset.publish_id}: Dataset modified: ${datasetModified.toISO()}, DOI modified: ${doiModified.toISO()}`,
);
// Update if dataset was modified after the DOI record
return datasetModified > doiModified;
} catch (error) {
logger.warn(`Error checking update status for dataset ${dataset.publish_id}: ${error.message}`);
return true; // Update anyway if we can't determine status
}
}
private async updateDataciteRecord(dataset: Dataset, prefix: string, base_domain: string): Promise<void> {
try {
// Get the DOI identifier (HasOne relationship)
let doiIdentifier = dataset.identifier;
if (!doiIdentifier) {
await dataset.load('identifier');
doiIdentifier = dataset.identifier;
}
if (!doiIdentifier || doiIdentifier.type !== 'doi') {
throw new Error('No DOI identifier found for dataset');
}
// Generate XML metadata
const xmlMeta = (await Index.getDoiRegisterString(dataset)) as string;
if (!xmlMeta) {
throw new Error('Failed to generate XML metadata');
}
// Construct DOI value and landing page URL
const doiValue = doiIdentifier.value; // Use existing DOI value
const landingPageUrl = `https://doi.${getDomain(base_domain)}/${doiValue}`;
// Update DataCite record
const doiClient = new DoiClient();
const dataciteResponse = await doiClient.registerDoi(doiValue, xmlMeta, landingPageUrl);
if (dataciteResponse?.status === 201) {
// // Update dataset modification date
// dataset.server_date_modified = DateTime.now();
// await dataset.save();
// // Update search index
// const index_name = 'tethys-records';
// await Index.indexDocument(dataset, index_name);
logger.debug(`Dataset ${dataset.publish_id}: DataCite record updated successfully`);
} else {
throw new DoiClientException(
dataciteResponse?.status || 500,
`Unexpected DataCite response code: ${dataciteResponse?.status}`,
);
}
} catch (error) {
if (error instanceof DoiClientException) {
throw error;
}
throw new Error(`Failed to update DataCite record: ${error.message}`);
}
}
/**
* Shows detailed statistics for a dataset that needs updating
*/
private async showDatasetStats(dataset: Dataset): Promise<void> {
try {
let doiIdentifier = dataset.identifier;
if (!doiIdentifier) {
await dataset.load('identifier');
doiIdentifier = dataset.identifier;
}
const doiValue = doiIdentifier?.value || 'N/A';
const doiStatus = doiIdentifier?.status || 'N/A';
const datasetModified = dataset.server_date_modified;
// Get DOI info from DataCite
const doiClient = new DoiClient();
const doiLastModified = await doiClient.getDoiLastModified(doiValue);
const doiState = await doiClient.getDoiState(doiValue);
console.log(`
Dataset ${dataset.publish_id}
DOI Value: ${doiValue}
DOI Status (DB): ${doiStatus}
DOI State (DataCite): ${doiState || 'Unknown'}
Dataset Modified: ${datasetModified ? datasetModified.toISO() : 'N/A'}
DOI Modified: ${doiLastModified ? DateTime.fromJSDate(doiLastModified).toISO() : 'N/A'}
Needs Update: YES - Dataset newer than DOI
`);
} catch (error) {
console.log(`
Dataset ${dataset.publish_id}
DOI Value: ${dataset.identifier?.value || 'N/A'}
Error: ${error.message}
Needs Update: YES - Error checking status
`);
}
}
}

docs/commands/index-datasets.md Normal file
@@ -0,0 +1,278 @@
# Dataset Indexing Command
AdonisJS Ace command for indexing and synchronizing published datasets with OpenSearch for search functionality.
## Overview
The `index:datasets` command processes published datasets and creates/updates corresponding search index documents in OpenSearch. It intelligently compares modification timestamps to only re-index datasets when necessary, optimizing performance while maintaining search index accuracy.
## Command Syntax
```bash
node ace index:datasets [options]
```
## Options
| Flag | Alias | Description |
|------|-------|-------------|
| `--publish_id <number>` | `-p` | Index a specific dataset by publish_id |
## Usage Examples
### Basic Operations
```bash
# Index all published datasets that have been modified since last indexing
node ace index:datasets
# Index a specific dataset by publish_id
node ace index:datasets --publish_id 231
node ace index:datasets -p 231
```
## How It Works
### 1. **Dataset Selection**
The command processes datasets that meet these criteria:
- `server_state = 'published'` - Only published datasets
- Has preloaded `xmlCache` relationship for metadata transformation
- Optionally filtered by a specific `publish_id` (see the query sketch below)
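
A hedged Lucid sketch of this selection (the indexing command's source is not part of this commit, so the helper name is illustrative):

```typescript
import Dataset from '#models/dataset';

// Select all published datasets, optionally narrowed to one publish_id.
async function selectDatasets(publishId?: number) {
    const query = Dataset.query()
        .where('server_state', 'published') // only published datasets
        .preload('xmlCache'); // required for the XML metadata transformation
    if (publishId) {
        query.where('publish_id', publishId); // optional --publish_id filter
    }
    return query.exec();
}
```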
### 2. **Smart Update Detection**
For each dataset, the command:
- Checks if the dataset exists in the OpenSearch index
- Compares `server_date_modified` timestamps
- Only re-indexes if the dataset is newer than the indexed version (see the sketch below)
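
A minimal sketch of this check, assuming the `@opensearch-project/opensearch` client and that indexed documents store `server_date_modified` as epoch seconds (both assumptions; the command's source is not included in this commit):

```typescript
import { Client } from '@opensearch-project/opensearch';
import { DateTime } from 'luxon';

const client = new Client({ node: 'http://localhost:9200' }); // assumed dev host

// Returns true when the dataset should be (re-)indexed.
async function needsIndexing(publishId: number, datasetModified: DateTime): Promise<boolean> {
    try {
        const { body: exists } = await client.exists({ index: 'tethys-records', id: String(publishId) });
        if (!exists) return true; // new dataset: index it

        const { body } = await client.get({ index: 'tethys-records', id: String(publishId) });
        const indexedModified = DateTime.fromMillis(Number(body._source.server_date_modified) * 1000);
        return datasetModified > indexedModified; // re-index only when newer
    } catch {
        return true; // if the check fails, index defensively
    }
}
```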
### 3. **Document Processing**
The indexing process involves:
1. **XML Generation**: Creates structured XML from dataset metadata
2. **XSLT Transformation**: Converts XML to JSON using Saxon-JS processor
3. **Index Update**: Updates or creates the document in OpenSearch
4. **Logging**: Records success/failure for each operation
## Index Structure
### Index Configuration
- **Index Name**: `tethys-records`
- **Document ID**: Dataset `publish_id`
- **Refresh**: `true` (immediate availability; see the call sketch below)
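
Continuing the client sketch above, the index update then reduces to a single call; `doc` stands for the JSON document produced by the transformation step described below:

```typescript
// Create or overwrite the search document for one dataset.
await client.index({
    index: 'tethys-records', // index name from the configuration above
    id: String(dataset.publish_id), // document ID = publish_id
    body: doc, // JSON document from the XSLT transformation
    refresh: true, // make the change searchable immediately
});
```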
### Document Fields
The indexed documents contain:
- **Metadata Fields**: Title, description, authors, keywords
- **Identifiers**: DOI, publish_id, and other identifiers
- **Temporal Data**: Publication dates, coverage periods
- **Geographic Data**: Spatial coverage information
- **Technical Details**: Data formats, access information
- **Timestamps**: Creation and modification dates
## Example Output
### Successful Run
```bash
node ace index:datasets
```
```
Found 150 published datasets to process
Dataset with publish_id 231 successfully indexed
Dataset with publish_id 245 is up to date, skipping indexing
Dataset with publish_id 267 successfully indexed
An error occurred while indexing dataset with publish_id 289. Error: Invalid XML metadata
Processing completed: 148 indexed, 1 skipped, 1 error
```
### Specific Dataset
```bash
node ace index:datasets --publish_id 231
```
```
Found 1 published dataset to process
Dataset with publish_id 231 successfully indexed
Processing completed: 1 indexed, 0 skipped, 0 errors
```
## Update Logic
The command uses intelligent indexing to avoid unnecessary processing:
| Condition | Action | Reason |
|-----------|--------|--------|
| Dataset not in index | ✅ Index | New dataset needs indexing |
| Dataset newer than indexed version | ✅ Re-index | Dataset has been updated |
| Dataset same/older than indexed version | ❌ Skip | Already up to date |
| OpenSearch document check fails | ✅ Index | Better safe than sorry |
| Invalid XML metadata | ❌ Skip + Log Error | Cannot process invalid data |
### Timestamp Comparison
```typescript
// Example comparison logic
const existingModified = DateTime.fromMillis(Number(existingDoc.server_date_modified) * 1000);
const currentModified = dataset.server_date_modified;
if (currentModified <= existingModified) {
// Skip - already up to date
return false;
}
// Proceed with indexing
```
## XML Transformation Process
### 1. **XML Generation**
```xml
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<root>
<Dataset>
<!-- Dataset metadata fields -->
<title>Research Dataset Title</title>
<description>Dataset description...</description>
<!-- Additional metadata -->
</Dataset>
</root>
```
### 2. **XSLT Processing**
The command uses Saxon-JS with a compiled stylesheet (`solr.sef.json`) to transform XML to JSON:
```javascript
const result = await SaxonJS.transform({
stylesheetText: proc,
destination: 'serialized',
sourceText: xmlString,
});
```
### 3. **Final JSON Document**
```json
{
"id": "231",
"title": "Research Dataset Title",
"description": "Dataset description...",
"authors": ["Author Name"],
"server_date_modified": 1634567890,
"publish_id": 231
}
```
## Configuration Requirements
### Environment Variables
```bash
# OpenSearch Configuration
OPENSEARCH_HOST=localhost:9200
# For production:
# OPENSEARCH_HOST=your-opensearch-cluster:9200
```
### Required Files
- **XSLT Stylesheet**: `public/assets2/solr.sef.json` - Compiled Saxon-JS stylesheet for XML transformation
### Database Relationships
The command expects these model relationships:
```typescript
// Dataset model must have:
@hasOne(() => XmlCache, { foreignKey: 'dataset_id' })
public xmlCache: HasOne<typeof XmlCache>
```
## Error Handling
The command handles various error scenarios gracefully:
### Common Errors and Solutions
| Error | Cause | Solution |
|-------|-------|----------|
| `XSLT transformation failed` | Invalid XML or missing stylesheet | Check XML structure and stylesheet path |
| `OpenSearch connection error` | Service unavailable | Verify OpenSearch is running and accessible |
| `JSON parse error` | Malformed transformation result | Check XSLT stylesheet output format |
| `Missing xmlCache relationship` | Data integrity issue | Ensure xmlCache exists for dataset |
### Error Logging
```bash
# Typical error log entry
An error occurred while indexing dataset with publish_id 231.
Error: XSLT transformation failed: Invalid XML structure at line 15
```
## Performance Considerations
### Batch Processing
- Processes datasets sequentially to avoid overwhelming OpenSearch
- Each dataset is committed individually for reliability
- Failed indexing of one dataset doesn't stop processing others
### Resource Usage
- **Memory**: XML/JSON transformations require temporary memory
- **Network**: OpenSearch API calls for each dataset
- **CPU**: XSLT transformations are CPU-intensive
### Optimization Tips
```bash
# Index only recently modified datasets (run regularly)
node ace index:datasets
# Index specific datasets when needed
node ace index:datasets --publish_id 231
# Consider running during off-peak hours for large batches
```
## Integration with Other Systems
### Search Functionality
The indexed documents power:
- **Dataset Search**: Full-text search across metadata
- **Faceted Browsing**: Filter by authors, keywords, dates
- **Geographic Search**: Spatial query capabilities
- **Auto-complete**: Suggest dataset titles and keywords
### Related Commands
- [`update:datacite`](update-datacite.md) - Often run after indexing to sync DOI metadata
- **Database migrations** - May require re-indexing after schema changes
### API Integration
The indexed data is consumed by:
- **Search API**: `/api/search` endpoints
- **Browse API**: `/api/datasets` with filtering
- **Recommendations**: Related dataset suggestions
## Monitoring and Maintenance
### Regular Tasks
```bash
# Daily indexing (recommended cron job)
0 2 * * * cd /path/to/project && node ace index:datasets
# Weekly catch-up run (if needed; the command skips datasets that are already up to date)
0 3 * * 0 cd /path/to/project && node ace index:datasets
```
### Health Checks
- Monitor OpenSearch cluster health
- Check for failed indexing operations in logs
- Verify search functionality is working
- Compare dataset counts between database and index
### Troubleshooting
```bash
# Check specific dataset indexing
node ace index:datasets --publish_id 231
# Verify OpenSearch connectivity
curl -X GET "localhost:9200/_cluster/health"
# Check index statistics
curl -X GET "localhost:9200/tethys-records/_stats"
```
## Best Practices
1. **Regular Scheduling**: Run the command regularly (daily) to keep the search index current
2. **Monitor Logs**: Watch for transformation errors or OpenSearch issues
3. **Backup Strategy**: Include OpenSearch indices in backup procedures
4. **Resource Management**: Monitor OpenSearch cluster resources during bulk operations
5. **Testing**: Verify search functionality after major indexing operations
6. **Coordination**: Run indexing before DataCite updates when both are needed

docs/commands/update-datacite.md Normal file
@@ -0,0 +1,216 @@
# DataCite Update Command
AdonisJS Ace command for updating DataCite DOI records for published datasets.
## Overview
The `update:datacite` command synchronizes your local dataset metadata with DataCite DOI records. It intelligently compares modification dates to only update records when necessary, reducing unnecessary API calls and maintaining data consistency.
## Command Syntax
```bash
node ace update:datacite [options]
```
## Options
| Flag | Alias | Description |
|------|-------|-------------|
| `--publish_id <number>` | `-p` | Update a specific dataset by publish_id |
| `--force` | `-f` | Force update all records regardless of modification date |
| `--dry-run` | `-d` | Preview what would be updated without making changes |
| `--stats` | `-s` | Show detailed statistics for datasets that need updating |
## Usage Examples
### Basic Operations
```bash
# Update all datasets that have been modified since their DOI was last updated
node ace update:datacite
# Update a specific dataset
node ace update:datacite --publish_id 231
node ace update:datacite -p 231
# Force update all datasets with DOIs (ignores modification dates)
node ace update:datacite --force
```
### Preview and Analysis
```bash
# Preview what would be updated (dry run)
node ace update:datacite --dry-run
# Show detailed statistics for datasets that need updating
node ace update:datacite --stats
# Show stats for a specific dataset
node ace update:datacite --stats --publish_id 231
```
### Combined Options
```bash
# Dry run for a specific dataset
node ace update:datacite --dry-run --publish_id 231
# Show stats for all datasets (including up-to-date ones)
node ace update:datacite --stats --force
```
## Command Modes
### 1. **Normal Mode** (Default)
Updates DataCite records for datasets that have been modified since their DOI was last updated.
**Example Output:**
```
Using DataCite API: https://api.test.datacite.org
Found 50 datasets to process
Dataset 231: Successfully updated DataCite record
Dataset 245: Up to date, skipping
Dataset 267: Successfully updated DataCite record
DataCite update completed. Updated: 15, Skipped: 35, Errors: 0
```
### 2. **Dry Run Mode** (`--dry-run`)
Shows what would be updated without making any changes to DataCite.
**Use Case:** Preview updates before running the actual command.
**Example Output:**
```
Dataset 231: Would update DataCite record (dry run)
Dataset 267: Would update DataCite record (dry run)
Dataset 245: Up to date, skipping
DataCite update completed. Updated: 2, Skipped: 1, Errors: 0
```
### 3. **Stats Mode** (`--stats`)
Shows detailed information for each dataset that needs updating, including why it needs updating.
**Use Case:** Debug synchronization issues, monitor dataset/DOI status, generate reports.
**Example Output:**
```
┌─ Dataset 231 ─────────────────────────────────────────────────────────
│ DOI Value: 10.21388/tethys.231
│ DOI Status (DB): findable
│ DOI State (DataCite): findable
│ Dataset Modified: 2024-09-15T10:30:00.000Z
│ DOI Modified: 2024-09-10T08:15:00.000Z
│ Needs Update: YES - Dataset newer than DOI
└───────────────────────────────────────────────────────────────────────
┌─ Dataset 267 ─────────────────────────────────────────────────────────
│ DOI Value: 10.21388/tethys.267
│ DOI Status (DB): findable
│ DOI State (DataCite): findable
│ Dataset Modified: 2024-09-18T14:20:00.000Z
│ DOI Modified: 2024-09-16T12:45:00.000Z
│ Needs Update: YES - Dataset newer than DOI
└───────────────────────────────────────────────────────────────────────
DataCite Stats Summary: 2 datasets need updating, 48 are up to date
```
## Update Logic
The command uses intelligent update detection (a condensed code sketch follows the table below):
1. **Compares modification dates**: Dataset `server_date_modified` vs DOI last modification date from DataCite
2. **Validates data integrity**: Checks for missing or future dates
3. **Handles API failures gracefully**: Updates anyway if DataCite info can't be retrieved
4. **Uses dual API approach**: DataCite REST API (primary) with MDS API fallback
### When Updates Happen
| Condition | Action | Reason |
|-----------|--------|--------|
| Dataset modified > DOI modified | ✅ Update | Dataset has newer changes |
| Dataset modified ≤ DOI modified | ❌ Skip | DOI is up to date |
| Dataset date in future | ❌ Skip | Invalid data, needs investigation |
| Dataset date missing | ✅ Update | Can't determine staleness |
| DataCite API error | ✅ Update | Better safe than sorry |
| `--force` flag used | ✅ Update | Override all logic |
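
Condensed from the `shouldUpdateDataset` implementation in `commands/update_datacite.ts` (part of this commit), the decision logic is roughly:

```typescript
import { DateTime } from 'luxon';
import { DoiClient } from '#app/Library/Doi/DoiClient';

// Decide whether a dataset's DataCite record needs an update.
async function shouldUpdate(dataset: { server_date_modified: DateTime | null }, doiValue: string, force: boolean): Promise<boolean> {
    if (force) return true; // --force overrides all checks
    const modified = dataset.server_date_modified;
    if (!modified) return true; // missing date: staleness unknown, update anyway
    if (modified > DateTime.now()) return false; // future date: data integrity issue, skip
    const doiModified = await new DoiClient().getDoiLastModified(doiValue);
    if (!doiModified) return true; // DataCite info unavailable: update anyway
    return modified > DateTime.fromJSDate(doiModified); // update only if the dataset is newer
}
```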
## Environment Configuration
Required environment variables:
```bash
# DataCite Credentials
DATACITE_USERNAME=your_username
DATACITE_PASSWORD=your_password
# API Endpoints (environment-specific)
DATACITE_API_URL=https://api.test.datacite.org # Test environment
DATACITE_SERVICE_URL=https://mds.test.datacite.org # Test MDS
# For production:
# DATACITE_API_URL=https://api.datacite.org
# DATACITE_SERVICE_URL=https://mds.datacite.org
# Project Configuration
DATACITE_PREFIX=10.21388 # Your DOI prefix
BASE_DOMAIN=tethys.at # Your domain
```
## Error Handling
The command handles various error scenarios:
- **Invalid modification dates**: Logs errors but continues processing other datasets
- **DataCite API failures**: Falls back to MDS API, then to safe update
- **Missing DOI identifiers**: Skips datasets without DOI identifiers
- **Network issues**: Continues with next dataset after logging error
## Integration
The command integrates with:
- **Dataset Model**: Uses `server_date_modified` for change detection
- **DatasetIdentifier Model**: Reads DOI values and status
- **OpenSearch Index**: Run `index:datasets` after DataCite updates to keep the search index in sync
- **DoiClient**: Handles all DataCite API interactions
## Common Workflows
### Daily Maintenance
```bash
# Update any datasets modified today
node ace update:datacite
```
### Pre-Deployment Check
```bash
# Check what would be updated before deployment
node ace update:datacite --dry-run
```
### Debugging Sync Issues
```bash
# Investigate why specific dataset isn't syncing
node ace update:datacite --stats --publish_id 231
```
### Full Resync
```bash
# Force update all DOI records (use with caution)
node ace update:datacite --force
```
### Monitoring Report
```bash
# Generate sync status report
node ace update:datacite --stats > datacite-sync-report.txt
```
## Best Practices
1. **Regular Updates**: Run daily or after bulk dataset modifications
2. **Test First**: Use `--dry-run` or `--stats` before bulk operations
3. **Monitor Logs**: Check for data integrity warnings
4. **Environment Separation**: Use correct API URLs for test vs production
5. **Rate Limiting**: Datasets are processed sequentially, which keeps the request rate to DataCite modest

package-lock.json (generated; diff suppressed because it is too large)

readme.md
@@ -11,6 +11,8 @@ Welcome to the Tethys Research Repository Backend System! This is the backend co
- [Configuration](#configuration)
- [Database](#database)
- [API Documentation](#api-documentation)
- [Commands](#commands)
- [Documentation](#documentation)
- [Contributing](#contributing)
- [License](#license)
@@ -29,5 +31,175 @@ Before you begin, ensure you have met the following requirements:
1. Clone this repository:
```bash
git clone https://gitea.geologie.ac.at/geolba/tethys.backend.git
cd tethys-backend
```
2. Install dependencies:
```bash
npm install
```
3. Configure environment variables (see [Configuration](#configuration))
4. Run database migrations:
```bash
node ace migration:run
```
5. Start the development server:
```bash
npm run dev
```
## Usage
The Tethys Backend provides RESTful APIs for managing research datasets, user authentication, DOI registration, and search functionality.
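For example, listing published datasets and resolving one record by DOI against a local development server (a sketch; host and port are assumptions based on the AdonisJS default, and the example DOI is taken from the controller's docblock):
```typescript
const base = 'http://localhost:3333'; // assumed dev server address

// List published datasets (GET /api/datasets).
const datasets = await fetch(`${base}/api/datasets`).then((res) => res.json());
console.log(`${datasets.length} datasets`);

// Resolve a dataset by DOI (GET /api/dataset/:prefix/:value, added in this commit).
const dataset = await fetch(`${base}/api/dataset/10.24341/tethys.99.2`).then((res) => res.json());
console.log(dataset.titles);
```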
## Configuration
Copy the `.env.example` file to `.env` and configure the following variables:
### Database Configuration
```bash
DB_CONNECTION=pg
DB_HOST=localhost
DB_PORT=5432
DB_USER=your_username
DB_PASSWORD=your_password
DB_DATABASE=tethys_db
```
### DataCite Configuration
```bash
# DataCite Credentials
DATACITE_USERNAME=your_datacite_username
DATACITE_PASSWORD=your_datacite_password
DATACITE_PREFIX=10.21388
# Environment-specific API endpoints
DATACITE_API_URL=https://api.test.datacite.org # Test environment
DATACITE_SERVICE_URL=https://mds.test.datacite.org # Test MDS
# For production:
# DATACITE_API_URL=https://api.datacite.org
# DATACITE_SERVICE_URL=https://mds.datacite.org
```
### OpenSearch Configuration
```bash
OPENSEARCH_HOST=localhost:9200
```
### Application Configuration
```bash
BASE_DOMAIN=tethys.at
APP_KEY=your_app_key
```
## Database
The system uses PostgreSQL with Lucid ORM. Key models include:
- **Dataset**: Research dataset metadata
- **DatasetIdentifier**: DOI and other identifiers for datasets
- **User**: User management and authentication
- **XmlCache**: Cached XML metadata
Run migrations and seeders:
```bash
# Run migrations
node ace migration:run
# Run seeders (if available)
node ace db:seed
```
## API Documentation
API endpoints are available for:
- Dataset management (`/api/datasets`)
- User authentication (`/api/auth`)
- DOI registration (`/api/doi`)
- Search functionality (`/api/search`)
*Detailed API documentation can be found in the `/docs/api` directory.*
## Commands
The system includes several Ace commands for maintenance and data management:
### Dataset Indexing
```bash
# Index all published datasets to OpenSearch
node ace index:datasets
# Index a specific dataset
node ace index:datasets --publish_id 123
```
### DataCite DOI Management
```bash
# Update DataCite records for modified datasets
node ace update:datacite
# Show detailed statistics for datasets needing updates
node ace update:datacite --stats
# Preview what would be updated (dry run)
node ace update:datacite --dry-run
# Force update all DOI records
node ace update:datacite --force
# Update a specific dataset
node ace update:datacite --publish_id 123
```
*For detailed command documentation, see the [Commands Documentation](docs/commands/)*
## Documentation
Comprehensive documentation is available in the `/docs` directory:
- **[Commands Documentation](docs/commands/)** - Detailed guides for Ace commands
- [DataCite Update Command](docs/commands/update-datacite.md) - DOI synchronization and management
- [Dataset Indexing Command](docs/commands/index-datasets.md) - Search index management
- **[API Documentation](docs/api/)** - REST API endpoints and usage
- **[Deployment Guide](docs/deployment/)** - Production deployment instructions
- **[Configuration Guide](docs/configuration/)** - Environment setup and configuration options
## Contributing
1. Fork the repository
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
3. Commit your changes (`git commit -m 'Add some amazing feature'`)
4. Push to the branch (`git push origin feature/amazing-feature`)
5. Open a Pull Request
### Development Guidelines
- Follow the existing code style and conventions
- Write tests for new features
- Update documentation for any API changes
- Ensure all commands and migrations work properly
### Testing Commands
```bash
# Run tests
npm test
# Test specific commands
node ace update:datacite --dry-run --publish_id 123
node ace index:datasets --publish_id 123
```
## License
This project is licensed under the [MIT License](LICENSE).

start/routes/api.ts
@@ -8,14 +8,24 @@ import AvatarController from '#controllers/Http/Api/AvatarController';
import UserController from '#controllers/Http/Api/UserController';
import CollectionsController from '#controllers/Http/Api/collections_controller';
import { middleware } from '../kernel.js';

// Clean DOI URL routes (no /api prefix)
// API routes with /api prefix
router
    .group(() => {
        router.get('clients', [UserController, 'getSubmitters']).as('client.index').use(middleware.auth());
        router.get('authors', [AuthorsController, 'index']).as('author.index').use(middleware.auth());
        router.get('datasets', [DatasetController, 'index']).as('dataset.index');
        router.get('persons', [AuthorsController, 'persons']).as('author.persons');
        // This should come BEFORE any other routes that might conflict
        router
            .get('/dataset/:prefix/:value', [DatasetController, 'findByIdentifier'])
            .where('prefix', /^10\.\d+$/) // Match DOI prefix pattern (10.xxxx)
            .where('value', /^[a-zA-Z0-9._-]+\.[0-9]+(?:\.[0-9]+)*$/) // Match DOI suffix pattern
            .as('dataset.findByIdentifier');
        router.get('/dataset', [DatasetController, 'findAll']).as('dataset.findAll');
        router.get('/dataset/:publish_id', [DatasetController, 'findOne']).as('dataset.findOne');
        router.get('/sitelinks/:year', [HomeController, 'findDocumentsPerYear']);
@@ -35,7 +45,7 @@ router
            .as('apps.twofactor_backupcodes.create')
            .use(middleware.auth());

        router.get('collections/:id', [CollectionsController, 'show']).as('collection.show');
    })
    // .namespace('App/Controllers/Http/Api')
    .prefix('api');

start/rules/orcid.ts
@@ -1,7 +1,7 @@
/*
|--------------------------------------------------------------------------
| Do you want to register the preload file in .adonisrc.ts file? (y/N) · true
| DONE: create start/rules/orcid.ts
| DONE: update adonisrc.ts file
|--------------------------------------------------------------------------