- added earliestPublicationDate for App/Models/Dataset.ts
All checks were successful
CI Pipeline / japa-tests (push) Successful in 49s

- new classes TokenWorkerService.ts, TokenWorker.ts and ResumptionToken.ts for using REDIS with paging OAI results
- deleted public/assets2/langCodeMap.xml: integrated it directly in datasetxml2oai-pmh.xslt
- added redis npm package
- added TokenWorkerProvider.ts for using singleton of TokenWorkerService inside OaiController.ts
- added config/oai.ts for oai related configs from .env-file
- adapted XmlModel.ts for getting domDocument from database
This commit is contained in:
Kaimbacher 2023-10-03 21:11:02 +02:00
parent 2a7480d2ed
commit 7915f66dd6
16 changed files with 691 additions and 89 deletions

View file

@ -13,8 +13,15 @@ import { OaiErrorCodes, OaiModelError } from 'App/Exceptions/OaiErrorCodes';
import { OaiModelException, BadOaiModelException } from 'App/Exceptions/OaiModelException';
import Dataset from 'App/Models/Dataset';
import Collection from 'App/Models/Collection';
import { getDomain } from 'App/Utils/utility-functions';
import { getDomain, preg_match } from 'App/Utils/utility-functions';
import XmlModel from 'App/Library/XmlModel';
import Logger from '@ioc:Adonis/Core/Logger';
import ResumptionToken from 'App/Library/Oai/ResumptionToken';
import { ModelQueryBuilderContract } from '@ioc:Adonis/Lucid/Orm';
import Config from '@ioc:Adonis/Core/Config';
import { inject } from '@adonisjs/fold';
// import { TokenWorkerContract } from "MyApp/Models/TokenWorker";
import TokenWorkerContract from 'App/Library/Oai/TokenWorker';
interface XslTParameter {
[key: string]: any;
@ -24,12 +31,19 @@ interface Dictionary {
[index: string]: string;
}
// Mutable paging state that handleLists creates and hands to handleResumptionToken /
// handleNoResumptionToken, which fill it in for the current list request.
interface ListParameter {
// Value passed on to setParamResumption: 0 for a first request, token.startPosition - 1 when resuming.
cursor: number;
// Size of the complete id list (set from reldocIds.length on a first request, from the token when resuming).
totalIds: number;
// Start position stored in the next resumption token (page size + 1 initially, advanced by the page size when resuming).
start: number;
// publish_ids that still have to be delivered; handleLists splices the current page off the front.
reldocIds: (number | null)[];
// metadataPrefix of the list sequence, taken from the request or from the stored token.
metadataPrefix: string;
}
@inject(['App/Library/Oai/TokenWorkerContract'])
export default class OaiController {
private deliveringDocumentStates = ['published', 'deleted'];
// private sampleRegEx = /^[A-Za-zäüÄÜß0-9\-_.!~]+$/;
private sampleRegEx = /^[A-Za-zäüÄÜß0-9\-_.!~]+$/;
private xsltParameter: XslTParameter;
// private configuration: Configuration;
// private tokenWorker: TokenWorker;
/**
* Holds xml representation of document information to be processed.
@ -39,13 +53,9 @@ export default class OaiController {
private xml: XMLBuilder;
private proc;
constructor() {
constructor(public tokenWorker: TokenWorkerContract) {
// Load the XSLT file
this.proc = readFileSync('public/assets2/datasetxml2oai.sef.json');
// tests
// const xslPath = 'assets/datasetxml2oai-pmh.xslt'; // Replace with the actual path to your XSLT file
// this.proc = readFileSync(xslPath, 'utf-8');
// this.configuration = new Configuration();
dayjs.extend(utc);
dayjs.extend(timezone);
}
@ -66,8 +76,15 @@ export default class OaiController {
xsltParameter['oai_error_code'] = 'unknown';
xsltParameter['oai_error_message'] = 'Only POST and GET methods are allowed for OAI-PMH.';
}
let earliestDateFromDb;
// const oaiRequest: OaiParameter = request.body;
try {
const firstPublishedDataset: Dataset | null = await Dataset.earliestPublicationDate();
firstPublishedDataset != null &&
(earliestDateFromDb = firstPublishedDataset.server_date_published.toFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"));
this.xsltParameter['earliestDatestamp'] = earliestDateFromDb;
// start the request
await this.handleRequest(oaiRequest, request);
} catch (error) {
if (error instanceof OaiModelException) {
@ -87,7 +104,7 @@ export default class OaiController {
const xmlString = this.xml.end({ prettyPrint: true });
let xmlOutput;
let xmlOutput; // = xmlString;
try {
const result = await transform({
// stylesheetFileName: `${config.TMP_BASE_DIR}/data-quality/rules/iati.sef.json`,
@ -123,7 +140,7 @@ export default class OaiController {
this.xsltParameter['unixTimestamp'] = now.unix();
// set OAI base url
const baseDomain = process.env.BASE_DOMAIN || 'localhost';
const baseDomain = process.env.OAI_BASE_DOMAIN || 'localhost';
this.xsltParameter['baseURL'] = baseDomain + '/oai';
this.xsltParameter['repURL'] = request.protocol() + '://' + request.hostname();
this.xsltParameter['downloadLink'] = request.protocol() + '://' + request.hostname() + '/file/download/';
@ -139,13 +156,11 @@ export default class OaiController {
this.handleListMetadataFormats();
} else if (verb == 'GetRecord') {
await this.handleGetRecord(oaiRequest);
}
// else if (verb == "ListRecords") {
// await this.handleListRecords(oaiRequest);
// } else if (verb == "ListIdentifiers") {
// await this.handleListIdentifiers(oaiRequest);
// }
else if (verb == 'ListSets') {
} else if (verb == 'ListRecords') {
await this.handleListRecords(oaiRequest);
} else if (verb == 'ListIdentifiers') {
await this.handleListIdentifiers(oaiRequest);
} else if (verb == 'ListSets') {
await this.handleListSets();
} else {
this.handleIllegalVerb();
@ -197,7 +212,7 @@ export default class OaiController {
const sets: { [key: string]: string } = {
'open_access': 'Set for open access licenses',
'doc-type:ResearchData': 'Set for document type ResearchData',
// ...(await this.getSetsForDatasetTypes()),
...(await this.getSetsForDatasetTypes()),
...(await this.getSetsForCollections()),
// ... await this.getSetsForProjects(),
} as Dictionary;
@ -214,7 +229,13 @@ export default class OaiController {
this.xsltParameter['repIdentifier'] = repIdentifier;
const dataId = this.validateAndGetIdentifier(oaiRequest);
const dataset = await Dataset.query().where('publish_id', dataId).preload('xmlCache').preload('collections').first();
const dataset = await Dataset.query()
.where('publish_id', dataId)
.preload('xmlCache')
.preload('collections', (builder) => {
builder.preload('collectionRole');
})
.first();
if (!dataset || !dataset.publish_id) {
throw new OaiModelException(
@ -234,6 +255,229 @@ export default class OaiController {
await this.createXmlRecord(dataset, datasetNode);
}
/**
 * Handles the OAI-PMH verb ListIdentifiers.
 *
 * Makes sure the token store is connected, reads the page size from the
 * 'oai.max.listidentifiers' config entry (falling back to 100) and delegates
 * the actual list handling to handleLists.
 *
 * @param oaiRequest - request arguments of the current OAI call
 */
protected async handleListIdentifiers(oaiRequest: Dictionary) {
    if (!this.tokenWorker.isConnected) {
        await this.tokenWorker.connect();
    }

    const pageSize: number = Config.get('oai.max.listidentifiers', 100);
    await this.handleLists(oaiRequest, pageSize);
}
/**
 * Handles the OAI-PMH verb ListRecords.
 *
 * Makes sure the token store is connected, reads the page size from the
 * 'oai.max.listrecords' config entry (falling back to 100) and delegates
 * the actual list handling to handleLists.
 *
 * @param oaiRequest - request arguments of the current OAI call
 */
protected async handleListRecords(oaiRequest: Dictionary) {
    // Lazily (re)connect: the store reports false until its client has connected.
    if (!this.tokenWorker.isConnected) {
        await this.tokenWorker.connect();
    }

    const maxRecords: number = Config.get('oai.max.listrecords', 100);
    await this.handleLists(oaiRequest, maxRecords);
}
/**
 * Shared implementation of ListRecords and ListIdentifiers.
 *
 * Resolves the list of publish_ids (from a stored resumption token or from the
 * request arguments), renders the first `maxRecords` of them into the
 * 'Datasets' node and, if ids are left over, stores them under a new
 * resumption token whose id is exposed to the stylesheet.
 *
 * @param oaiRequest - request arguments of the current OAI call
 * @param maxRecords - page size; a falsy value falls back to 100
 * @throws OaiModelException with NORECORDSMATCH when the page would be empty
 */
private async handleLists(oaiRequest: Dictionary, maxRecords: number) {
    const pageSize = maxRecords || 100;

    this.xsltParameter['repIdentifier'] = 'tethys.at';
    const datasetNode = this.xml.root().ele('Datasets');

    // paging state, filled in by one of the two helpers below
    const listState: ListParameter = {
        cursor: 0,
        totalIds: 0,
        start: pageSize + 1,
        reldocIds: [],
        metadataPrefix: '',
    };

    const isResumed = 'resumptionToken' in oaiRequest;
    if (isResumed) {
        await this.handleResumptionToken(oaiRequest, pageSize, listState);
    } else {
        await this.handleNoResumptionToken(oaiRequest, listState);
    }

    // splice() removes the current page from the pending list in place
    const pendingIds = listState.reldocIds as number[];
    const pageIds = pendingIds.splice(0, pageSize) as number[];

    if (pageIds.length === 0) {
        throw new OaiModelException(
            StatusCodes.INTERNAL_SERVER_ERROR,
            'The combination of the given values results in an empty list.',
            OaiErrorCodes.NORECORDSMATCH,
        );
    }

    const datasets: Dataset[] = await Dataset.query()
        .whereIn('publish_id', pageIds)
        .preload('xmlCache')
        .preload('collections', (builder) => {
            builder.preload('collectionRole');
        })
        .orderBy('publish_id');

    for (const dataset of datasets) {
        await this.createXmlRecord(dataset, datasetNode);
    }

    // nothing left over: no resumption token has to be issued
    if (pendingIds.length <= 0) {
        return;
    }

    // remember the remaining ids for the follow-up request
    const token = new ResumptionToken();
    token.startPosition = listState.start;
    token.totalIds = listState.totalIds;
    token.documentIds = pendingIds;
    token.metadataPrefix = listState.metadataPrefix;

    const res: string = await this.tokenWorker.set(token);
    this.setParamResumption(res, listState.cursor, listState.totalIds);
}
/**
 * Restores the paging state of a list sequence from a stored resumption token.
 *
 * @param oaiRequest - request arguments; 'resumptionToken' holds the token id
 * @param maxRecords - page size, used to advance the start position
 * @param numWrapper - paging state that is overwritten with the token's values
 * @throws OaiModelException with BADRESUMPTIONTOKEN when no token is stored under the given id
 */
private async handleResumptionToken(oaiRequest: Dictionary, maxRecords: number, numWrapper: ListParameter) {
    const resParam = oaiRequest['resumptionToken']; // e.g. "158886496600000"
    const token = await this.tokenWorker.get(resParam);

    if (!token) {
        throw new OaiModelException(StatusCodes.INTERNAL_SERVER_ERROR, 'cache is outdated.', OaiErrorCodes.BADRESUMPTIONTOKEN);
    }

    // the cursor is zero-based, the stored start position is one-based
    numWrapper.cursor = token.startPosition - 1;
    // start position that the next token (if any) will carry
    numWrapper.start = token.startPosition + maxRecords;
    numWrapper.totalIds = token.totalIds;
    numWrapper.reldocIds = token.documentIds;
    numWrapper.metadataPrefix = token.metadataPrefix;

    this.xsltParameter['oai_metadataPrefix'] = numWrapper.metadataPrefix;
}
/**
 * Builds the complete id list for the first request of a list sequence
 * (no resumptionToken given).
 *
 * Validates 'metadataPrefix', restricts the query to deliverable server
 * states, applies the optional 'set' and 'from'/'until' arguments and stores
 * the matching publish_ids (ordered ascending) and their count in `numWrapper`.
 *
 * @param oaiRequest - request arguments of the current OAI call
 * @param numWrapper - paging state receiving metadataPrefix, reldocIds and totalIds
 * @throws OaiModelException with BADARGUMENT for a missing prefix or invalid dates,
 *         NORECORDSMATCH when the date range cannot match anything
 */
private async handleNoResumptionToken(oaiRequest: Dictionary, numWrapper: ListParameter) {
    if (!('metadataPrefix' in oaiRequest)) {
        throw new OaiModelException(
            StatusCodes.INTERNAL_SERVER_ERROR,
            'The prefix of the metadata argument is unknown.',
            OaiErrorCodes.BADARGUMENT,
        );
    }
    numWrapper.metadataPrefix = oaiRequest['metadataPrefix'];
    this.xsltParameter['oai_metadataPrefix'] = numWrapper.metadataPrefix;

    const finder: ModelQueryBuilderContract<typeof Dataset, Dataset> = Dataset.query();
    // add server state restrictions
    finder.whereIn('server_state', this.deliveringDocumentStates);

    if ('set' in oaiRequest) {
        this.applyOaiSetFilter(finder, oaiRequest['set']);
    }
    await this.applyOaiDateFilter(finder, oaiRequest);

    const reldocIdsDocs = await finder.select('publish_id').orderBy('publish_id');
    numWrapper.reldocIds = reldocIdsDocs.map((dat) => dat.publish_id);
    numWrapper.totalIds = numWrapper.reldocIds.length;
}

/**
 * Narrows the query to the given OAI set spec ('data-type:<type>', 'open_access' or 'ddc:<number>').
 * Unknown set names and specs without exactly one non-empty value leave the query untouched.
 */
private applyOaiSetFilter(finder: ModelQueryBuilderContract<typeof Dataset, Dataset>, set: string): void {
    const setArray = set.split(':');
    const hasSingleValue = setArray.length == 2 && setArray[1] != '';

    switch (setArray[0]) {
        case 'data-type':
            if (hasSingleValue) {
                finder.where('type', setArray[1]);
            }
            break;
        case 'open_access': {
            const openAccessLicences = ['CC-BY-4.0', 'CC-BY-SA-4.0'];
            finder.andWhereHas('licenses', (query) => {
                query.whereIn('name', openAccessLicences);
            });
            break;
        }
        case 'ddc':
            if (hasSingleValue) {
                finder.andWhereHas('collections', (query) => {
                    query.where('number', setArray[1]);
                });
            }
            break;
    }
}

/**
 * Applies the optional 'from' / 'until' arguments to the query on 'server_date_published'.
 * Values are interpreted in the 'Europe/Vienna' time zone.
 */
private async applyOaiDateFilter(finder: ModelQueryBuilderContract<typeof Dataset, Dataset>, oaiRequest: Dictionary): Promise<void> {
    const timeZone = 'Europe/Vienna';
    const dbFormat = 'YYYY-MM-DD HH:mm:ss';
    const hasFrom = 'from' in oaiRequest;
    const hasUntil = 'until' in oaiRequest;

    // Day granularity is decided by the argument's form (no time part), not by the parsed hour:
    // checking `hour() == 0` also matched full timestamps between 00:00:00 and 00:59:59 and
    // silently widened them to the whole day.
    const hasDayGranularity = (value: string): boolean => !/[Tt\s]/.test(value.trim());
    const lowerBound = (value: string) => {
        const parsed = dayjs.tz(value, timeZone);
        return hasDayGranularity(value) ? parsed.startOf('day') : parsed;
    };
    const upperBound = (value: string) => {
        const parsed = dayjs.tz(value, timeZone);
        return hasDayGranularity(value) ? parsed.endOf('day') : parsed;
    };

    if (hasFrom && hasUntil) {
        // e.g. &from=2020-09-11&until=2021-05-11
        const from = oaiRequest['from'];
        const until = oaiRequest['until'];
        if (!dayjs(from).isValid() || !dayjs(until).isValid()) {
            throw new OaiModelException(StatusCodes.INTERNAL_SERVER_ERROR, 'Date Parameter is not valid.', OaiErrorCodes.BADARGUMENT);
        }
        if (from.length != until.length) {
            throw new OaiModelException(
                StatusCodes.INTERNAL_SERVER_ERROR,
                'The request has different granularities for the from and until parameters.',
                OaiErrorCodes.BADARGUMENT,
            );
        }
        finder.whereBetween('server_date_published', [lowerBound(from).format(dbFormat), upperBound(until).format(dbFormat)]);
    } else if (hasFrom) {
        const from = oaiRequest['from'];
        if (!dayjs(from).isValid()) {
            throw new OaiModelException(
                StatusCodes.INTERNAL_SERVER_ERROR,
                'From date parameter is not valid.',
                OaiErrorCodes.BADARGUMENT,
            );
        }
        const fromDate = lowerBound(from);
        if (fromDate.isAfter(dayjs())) {
            throw new OaiModelException(
                StatusCodes.INTERNAL_SERVER_ERROR,
                'Given from date is greater than now. The given values results in an empty list.',
                OaiErrorCodes.NORECORDSMATCH,
            );
        }
        finder.andWhere('server_date_published', '>=', fromDate.format(dbFormat));
    } else if (hasUntil) {
        const until = oaiRequest['until'];
        if (!dayjs(until).isValid()) {
            throw new OaiModelException(
                StatusCodes.INTERNAL_SERVER_ERROR,
                'Until date parameter is not valid.',
                OaiErrorCodes.BADARGUMENT,
            );
        }
        const untilDate = upperBound(until);

        // earliestPublicationDate() yields null when nothing is published yet; in that case the
        // shortcut is skipped and the (then empty) query result is handled by the caller.
        const firstPublishedDataset = await Dataset.earliestPublicationDate();
        const earliestPublished = firstPublishedDataset?.server_date_published;
        if (earliestPublished && dayjs(earliestPublished.toISO()).isAfter(untilDate)) {
            throw new OaiModelException(
                StatusCodes.INTERNAL_SERVER_ERROR,
                'earliestDatestamp is greater than given until date.\nThe given values results in an empty list.',
                OaiErrorCodes.NORECORDSMATCH,
            );
        }
        finder.andWhere('server_date_published', '<=', untilDate.format(dbFormat));
    }
}
/**
 * Publishes the values of the resumptionToken element to the stylesheet parameters.
 *
 * @param res - id under which the resumption token was stored
 * @param cursor - cursor value of the current page
 * @param totalIds - size of the complete list
 */
private setParamResumption(res: string, cursor: number, totalIds: number) {
    // 'HH' is the 24-hour clock ('hh' would wrap after noon), and the instant is converted to UTC
    // so that the literal 'Z' suffix is actually true.
    // NOTE(review): the expiry is hard-coded to one day — confirm it matches the token store's ttl.
    const tomorrow = dayjs().utc().add(1, 'day').format('YYYY-MM-DDTHH:mm:ss[Z]');

    this.xsltParameter['dateDelete'] = tomorrow;
    this.xsltParameter['res'] = res;
    this.xsltParameter['cursor'] = cursor;
    this.xsltParameter['totalIds'] = totalIds;
}
private validateAndGetIdentifier(oaiRequest: Dictionary): number {
// Identifier references metadata Urn, not plain Id!
// Currently implemented as 'oai:foo.bar.de:{docId}' or 'urn:nbn...-123'
@ -283,12 +527,12 @@ export default class OaiController {
dataset.publish_id && this.addLandingPageAttribute(domNode, dataset.publish_id.toString());
this.addSpecInformation(domNode, 'data-type:' + dataset.type);
// if (dataset.collections) {
// for (const coll of dataset.collections) {
// const collRole = await coll.getCollectionRole();
// this.addSpecInformation(domNode, collRole.oai_name + ':' + coll.number);
// }
// }
if (dataset.collections) {
for (const coll of dataset.collections) {
const collRole = coll.collectionRole;
this.addSpecInformation(domNode, collRole.oai_name + ':' + coll.number);
}
}
datasetNode.import(domNode);
}
@ -315,7 +559,7 @@ export default class OaiController {
}
private addLandingPageAttribute(domNode: XMLBuilder, dataid: string) {
const baseDomain = process.env.BASE_DOMAIN || 'localhost';
const baseDomain = process.env.OAI_BASE_DOMAIN || 'localhost';
const url = 'https://' + getDomain(baseDomain) + '/dataset/' + dataid;
// add attribute du dataset xml element
domNode.att('landingpage', url);
@ -368,26 +612,24 @@ export default class OaiController {
return sets;
}
// private async getSetsForDatasetTypes(): Promise<IDictionary> {
// const sets: { [key: string]: string } = {} as IDictionary;
/**
 * Collects one OAI set per dataset type found among published datasets.
 *
 * Types that do not match `sampleRegEx` are logged as errors and left out.
 *
 * @returns map from set spec ('data-type:<type>') to its description
 */
private async getSetsForDatasetTypes(): Promise<Dictionary> {
    const typeSets: { [key: string]: string } = {} as Dictionary;

    const publishedDatasets: Array<Dataset> = await Dataset.query().select('type').where('server_state', 'published');

    for (const entry of publishedDatasets) {
        const hasInvalidType = entry.type && false == preg_match(this.sampleRegEx, entry.type);
        if (hasInvalidType) {
            Logger.error(`OAI-PMH: Invalid SetSpec (data-type='${entry.type}').\nAllowed characters are [${this.sampleRegEx}].`);
            continue;
        }

        typeSets['data-type:' + entry.type] = `Set for document type '${entry.type}'`;
    }

    return typeSets;
}
private handleIllegalVerb() {
this.xsltParameter['oai_error_code'] = 'badVerb';

View file

@ -0,0 +1,51 @@
/**
 * State of a paged OAI list sequence: the ids still to be delivered plus the
 * bookkeeping values needed to continue the sequence.
 *
 * The underscore-prefixed fields are what JSON.stringify emits for an instance,
 * so their names and order are kept stable.
 */
export default class ResumptionToken {
    private _documentIds: number[] = [];
    private _metadataPrefix = '';
    private _resumptionId = '';
    private _startPosition = 0;
    private _totalIds = 0;

    /** Metadata prefix, start position and total id count joined into one string. */
    get key(): string {
        return `${this.metadataPrefix}${this.startPosition}${this.totalIds}`;
    }

    /** Total number of ids in the complete list. */
    get totalIds(): number {
        return this._totalIds;
    }

    set totalIds(value: number) {
        this._totalIds = value;
    }

    /** Position at which this token's ids start within the complete list. */
    get startPosition(): number {
        return this._startPosition;
    }

    set startPosition(value: number) {
        this._startPosition = value;
    }

    /** Identifier of the token. */
    get resumptionId(): string {
        return this._resumptionId;
    }

    set resumptionId(value: string) {
        this._resumptionId = value;
    }

    /** Metadata prefix the list sequence was started with. */
    get metadataPrefix(): string {
        return this._metadataPrefix;
    }

    set metadataPrefix(value: string) {
        this._metadataPrefix = value;
    }

    /** Ids covered by this token. */
    get documentIds(): number[] {
        return this._documentIds;
    }

    /** Accepts a single id or an array; a single id is wrapped into a one-element array. */
    set documentIds(idsToStore: number | number[]) {
        if (Array.isArray(idsToStore)) {
            this._documentIds = idsToStore;
        } else {
            this._documentIds = [idsToStore];
        }
    }
}

View file

@ -0,0 +1,10 @@
import ResumptionToken from './ResumptionToken';
/**
 * Contract of a store that keeps resumption tokens between the requests of a
 * paged OAI list sequence.
 */
export default interface TokenWorkerContract {
    /** Time to live applied to stored tokens. */
    ttl: number;
    /** Whether the store currently considers itself connected to its backend. */
    isConnected: boolean;
    /** Opens the connection to the backend. */
    connect(): Promise<void>;
    /** Closes the connection to the backend. */
    close(): Promise<void>;
    /** Looks up the token stored under `key`; resolves to null when there is none. */
    get(key: string): Promise<ResumptionToken | null>;
    /** Stores `token` and resolves to the key it was stored under. */
    set(token: ResumptionToken): Promise<string>;
}

View file

@ -0,0 +1,95 @@
import ResumptionToken from './ResumptionToken';
import { createClient, RedisClientType } from 'redis';
import InternalServerErrorException from 'App/Exceptions/InternalServerException';
import { sprintf } from 'sprintf-js';
import dayjs from 'dayjs';
import TokenWorkerContract from './TokenWorker';
/**
 * Redis-backed store for resumption tokens.
 *
 * Tokens are serialized as JSON and written with an expiry of `ttl`; the key
 * is generated from the current unix timestamp plus a five-digit counter.
 */
export default class TokenWorkerService implements TokenWorkerContract {
    // Not referenced inside this class; kept because they are visible to subclasses.
    protected filePrefix = 'rs_';
    protected fileExtension = 'txt';

    private cache?: RedisClientType;
    public ttl: number;
    private url: string;
    private connected = false;

    /**
     * @param ttl - time to live handed to Redis SETEX for every stored token
     */
    constructor(ttl: number) {
        this.ttl = ttl; // time to live
        this.url = process.env.REDIS_URL || 'redis://127.0.0.1:6379';
    }

    /**
     * Creates a client for the configured URL and connects it.
     *
     * Callers invoke this whenever `isConnected` is false, so a client from an
     * earlier call may still exist; it is released first instead of being leaked.
     */
    public async connect(): Promise<void> {
        await this.releaseClient();

        const client: RedisClientType = createClient({ url: this.url });
        // The identity checks keep events of a replaced client from flipping the flag.
        client.on('error', (err) => {
            if (this.cache === client) {
                this.connected = false;
            }
            console.log('[Redis] Redis Client Error: ', err);
        });
        client.on('connect', () => {
            if (this.cache === client) {
                this.connected = true;
            }
        });

        this.cache = client;
        await client.connect();
    }

    public get isConnected(): boolean {
        return this.connected;
    }

    /** Resolves to true when a value is stored under `key`. */
    public async has(key: string): Promise<boolean> {
        const result = await this.requireClient().get(key);
        return result !== undefined && result !== null;
    }

    /**
     * Stores the token under a freshly generated key with an expiry of `ttl`.
     *
     * @returns the generated key
     */
    public async set(token: ResumptionToken): Promise<string> {
        const client = this.requireClient();
        const uniqueName = await this.generateUniqueName();
        const serialToken = JSON.stringify(token);
        await client.setEx(uniqueName, this.ttl, serialToken);
        return uniqueName;
    }

    /** Generates a key (unix timestamp + zero-padded counter) that is not yet present in the cache. */
    private async generateUniqueName(): Promise<string> {
        let fc = 0;
        const uniqueId = dayjs().unix().toString();
        let uniqueName: string;
        let cacheKeyExists: boolean;
        do {
            // %s - string, %05d - decimal number padded with zeros to five digits
            uniqueName = sprintf('%s%05d', uniqueId, fc++);
            cacheKeyExists = await this.has(uniqueName);
        } while (cacheKeyExists);
        return uniqueName;
    }

    /**
     * Loads the token stored under `key`.
     *
     * @returns the restored token, or null when nothing is stored under `key`
     * @throws InternalServerErrorException when connect() has not been called
     */
    public async get(key: string): Promise<ResumptionToken | null> {
        const result = await this.requireClient().get(key);
        return result ? this.parseToken(result) : null;
    }

    /** Rebuilds a ResumptionToken from its JSON form by copying the parsed fields onto a new instance. */
    private parseToken(result: string): ResumptionToken {
        const rToken: ResumptionToken = new ResumptionToken();
        const parsed = JSON.parse(result);
        Object.assign(rToken, parsed);
        return rToken;
    }

    /** Removes the entry stored under `key`; the returned promise settles with the Redis command. */
    public async del(key: string): Promise<void> {
        await this.requireClient().del(key);
    }

    /** Removes all entries via FLUSHALL; the returned promise settles with the Redis command. */
    public async flush(): Promise<void> {
        await this.requireClient().flushAll();
    }

    /** Disconnects the client, if there is one, and marks the store as disconnected. */
    public async close(): Promise<void> {
        const client = this.cache;
        this.cache = undefined;
        this.connected = false;
        // disconnect() rejects on a client that is not open, so only call it when needed
        if (client?.isOpen) {
            await client.disconnect();
        }
    }

    /** Returns the current client or fails with a clear error when connect() was never called. */
    private requireClient(): RedisClientType {
        if (!this.cache) {
            throw new InternalServerErrorException('Resumption token cache is not connected; call connect() first.');
        }
        return this.cache;
    }

    /** Best-effort teardown of a client left over from an earlier connect() call. */
    private async releaseClient(): Promise<void> {
        const stale = this.cache;
        this.cache = undefined;
        this.connected = false;
        if (stale?.isOpen) {
            try {
                await stale.disconnect();
            } catch (err) {
                console.log('[Redis] Redis Client Error: ', err);
            }
        }
    }
}

View file

@ -3,6 +3,7 @@ import { XMLBuilder } from 'xmlbuilder2/lib/interfaces';
import Dataset from 'App/Models/Dataset';
import Strategy from './Strategy';
import { DateTime } from 'luxon';
import { builder } from 'xmlbuilder2';
/**
* This is the description of the interface
@ -84,10 +85,21 @@ export default class XmlModel {
this.cache = this.cache || new DocumentXmlCache();
this.cache.document_id = dataset.id;
this.cache.xml_version = 1; // (int)$this->strategy->getVersion();
// this.cache.server_date_modified = dataset.server_date_modified.toFormat("yyyy-MM-dd HH:mm:ss");
this.cache.server_date_modified = dataset.server_date_modified.toFormat("yyyy-MM-dd HH:mm:ss");
this.cache.xml_data = domDocument.end();
await this.cache.save();
}
const node = domDocument.find(
(n) => {
const test = n.node.nodeName == 'Rdr_Dataset';
return test;
},
false,
true,
)?.node;
if(node != undefined) {
domDocument = builder({ version: '1.0', encoding: 'UTF-8', standalone: true }, node);
}
}
return domDocument;
}

View file

@ -214,4 +214,12 @@ export default class Dataset extends DatasetExtension {
foreignKey: 'document_id',
})
public xmlCache: HasOne<typeof DocumentXmlCache>;
/**
 * Finds the published dataset with the smallest 'server_date_published'.
 *
 * @returns that dataset, or null when the query yields no row
 */
static async earliestPublicationDate(): Promise<Dataset | null> {
    const earliest = await this.query()
        .where('server_state', 'published')
        .orderBy('server_date_published', 'asc')
        .first();

    return earliest ? earliest : null;
}
}

View file

@ -19,3 +19,8 @@ export function getDomain(host: string): string {
myHost = myHost.replace(new RegExp(/^.*:\/\//i, 'g'), '');
return myHost;
}
/**
 * Tests whether `str` matches `regex`.
 *
 * @param regex - pattern to test with
 * @param str - text to test
 * @returns true when the pattern matches
 */
export function preg_match(regex: RegExp, str: string): boolean {
    // RegExp.prototype.test resumes at lastIndex for global/sticky patterns, so a reused
    // pattern would alternate between true and false; always start at the beginning.
    regex.lastIndex = 0;
    return regex.test(str);
}