import { Controller, Get } from "@overnightjs/core";
import Sequelize from "sequelize";
import { NextFunction, Request, Response } from "express";
import { StatusCodes } from "http-status-codes";
import { create } from "xmlbuilder2";
import { XMLBuilder } from "xmlbuilder2/lib/interfaces";
import { readFileSync } from "fs";
// @ts-ignore -- presumably needed because saxon-js provides no type declarations; confirm before removing
import { transform } from "saxon-js";
import dayjs, { Dayjs, OpUnitType } from "dayjs";
import { Dataset, Project, License } from "../models/init-models";
import Logger from "jet-logger";
import { BadOaiModelException, OaiModelException } from "../exceptions/OaiModelException";
import PageNotFoundException from "../exceptions/PageNotFoundException";
import { OaiErrorCodes } from "../exceptions/OaiErrorCodes";
import XmlModel from "../library/XmlModel";
import Configuration from "../library/oai/OaiConfiguration";
import ResumptionToken from "../library/oai/ResumptionToken";
import TokenWorker from "../library/oai/TokenWorker";

/** Parameters handed to the XSLT stylesheet. */
interface XslTParameter {
    [key: string]: any;
}

/** Query-string arguments of an OAI-PMH request. */
interface OaiParameter {
    [key: string]: any;
}

/** Plain string-to-string dictionary (setSpec -> set description). */
interface IDictionary {
    [index: string]: string;
}

/**
 * Datestamp format used in OAI responses. `HH` is the 24-hour clock; the former `hh`
 * rendered afternoon times on a 12-hour clock (15:00 became 03:00).
 * NOTE(review): dayjs formats local time here while the literal `Z` claims UTC — confirm
 * the server time zone or switch to the dayjs UTC plugin.
 */
const OAI_DATESTAMP_FORMAT = "YYYY-MM-DDTHH:mm:ss[Z]";

/** Format of the date-time strings compared against `server_date_published`. */
const SQL_DATETIME_FORMAT = "YYYY-MM-DD HH:mm:ss";

/** Matches date arguments given with day granularity (`YYYY-MM-DD`). */
const DAY_GRANULARITY = /^\d{4}-\d{2}-\d{2}$/;

/** Matches an IPv4 literal with an optional port. */
const IPV4_HOST = /^(\d{1,3}\.){3}\d{1,3}(:\d+)?$/;

/** Repository identifier used in OAI identifiers (`oai:<repIdentifier>:<id>`). */
const REPOSITORY_IDENTIFIER = "tethys.at";

/**
 * Argument passed to the TokenWorker constructor.
 * NOTE(review): presumably the resumption-token lifetime in seconds (one day) — confirm
 * against TokenWorker.
 */
const TOKEN_WORKER_TTL = 86400;

/**
 * Tests `str` against `regex` (named after the PHP function it replaces).
 *
 * @param regex pattern to test with
 * @param str string to test
 * @returns true if the pattern matches
 */
function preg_match(regex: RegExp, str: string): boolean {
    return regex.test(str);
}

/** Creates the empty XML document that the verb handlers fill with dataset nodes. */
function createEmptyDocument(): XMLBuilder {
    return create(
        { version: "1.0", encoding: "UTF-8", standalone: true },
        "",
    );
}

/**
 * OAI-PMH endpoint (`/oai`).
 *
 * Each request builds an intermediate XML document plus a set of stylesheet parameters and
 * runs both through the compiled XSLT stylesheet `datasetxml2oai.sef.json`.
 *
 * NOTE(review): `xml`, `xsltParameter` and `tokenWorker` are per-request state stored on the
 * controller instance. If one instance serves overlapping requests they can overwrite each
 * other across `await` points — confirm how the framework instantiates controllers.
 */
@Controller("oai")
export class OaiController {
    /** Server states whose datasets may be delivered via OAI. */
    private readonly deliveringDocumentStates = ["published", "deleted"];
    /** Characters allowed in a setSpec value. */
    private readonly sampleRegEx = /^[A-Za-zäüÄÜß0-9\-_.!~]+$/;
    /** Stylesheet parameters of the request currently being processed. */
    private xsltParameter: XslTParameter = {};
    private readonly configuration: Configuration;
    /** Created lazily by the list verbs. */
    private tokenWorker?: TokenWorker;
    /** XML representation of the dataset information to be transformed. */
    private xml: XMLBuilder = createEmptyDocument();
    /** Text of the compiled stylesheet (SEF JSON). */
    private readonly proc: string;

    constructor() {
        this.proc = readFileSync(__dirname + "/datasetxml2oai.sef.json", "utf8");
        this.configuration = new Configuration();
    }

    /**
     * Handles every OAI-PMH request.
     *
     * Errors raised while handling the verb are reported inside the OAI response
     * (`oai_error_code` / `oai_error_message`). Failures of the database lookup or of the
     * XSLT transformation itself are forwarded to the Express error handler via `next`.
     */
    @Get("")
    public async index(request: Request, response: Response, next: NextFunction): Promise<void> {
        let xmlOutput: unknown;
        try {
            this.xml = createEmptyDocument();
            // Same object as this.xsltParameter: the handlers below fill it in place.
            const xsltParameter: XslTParameter = (this.xsltParameter = {});

            const firstPublishedDataset: Dataset | null = await Dataset.earliestPublicationDate();
            let earliestDateFromDb: string | undefined;
            if (firstPublishedDataset != null) {
                earliestDateFromDb = dayjs(firstPublishedDataset.server_date_published).format(OAI_DATESTAMP_FORMAT);
            }
            xsltParameter["earliestDatestamp"] = earliestDateFromDb;

            const oaiRequest: OaiParameter = request.query;
            try {
                await this.handleRequest(oaiRequest, request);
            } catch (error) {
                if (error instanceof OaiModelException) {
                    xsltParameter["oai_error_code"] = error.oaiCode;
                    xsltParameter["oai_error_message"] = error.message;
                } else {
                    // Keep the cause visible in the log; the client only gets a generic message.
                    Logger.err(error);
                    xsltParameter["oai_error_code"] = "unknown";
                    xsltParameter["oai_error_message"] = "An internal error occured.";
                }
            }

            const xmlString = this.xml.end({ prettyPrint: true });
            const result = await transform({
                stylesheetText: this.proc,
                destination: "serialized",
                sourceText: xmlString,
                stylesheetParams: xsltParameter,
            });
            xmlOutput = result.principalResult;
        } catch (error) {
            return next(error);
        }

        response
            .header("Content-Type", "application/xml")
            .header("Access-Control-Allow-Origin", "*")
            .header("Access-Control-Allow-Methods", "GET,POST");
        response.status(StatusCodes.OK).send(xmlOutput);
    }

    /**
     * Sets the stylesheet parameters common to all verbs and dispatches to the verb handler.
     *
     * @param oaiRequest query arguments of the request
     * @param request the Express request (used for protocol and host)
     * @throws PageNotFoundException if no `verb` argument is given
     */
    protected async handleRequest(oaiRequest: OaiParameter, request: Request): Promise<void> {
        const now = dayjs();
        this.xsltParameter["responseDate"] = now.format(OAI_DATESTAMP_FORMAT);
        this.xsltParameter["unixTimestamp"] = now.unix();

        const baseDomain = process.env.BASE_DOMAIN || "localhost";
        const requestOrigin = request.protocol + "://" + request.get("host");
        this.xsltParameter["baseURL"] = baseDomain + "/oai";
        this.xsltParameter["repURL"] = requestOrigin;
        this.xsltParameter["downloadLink"] = requestOrigin + "/file/download/";
        this.xsltParameter["doiLink"] = "https://doi.org/";
        this.xsltParameter["doiPrefix"] = "info:eu-repo/semantics/altIdentifier/doi/";

        const verb = oaiRequest["verb"];
        if (!verb) {
            throw new PageNotFoundException("verb not found");
        }
        this.xsltParameter["oai_verb"] = verb;

        switch (verb) {
            case "Identify":
                this.handleIdentify();
                break;
            case "ListMetadataFormats":
                this.handleListMetadataFormats();
                break;
            case "GetRecord":
                await this.handleGetRecord(oaiRequest);
                break;
            case "ListRecords":
                await this.handleListRecords(oaiRequest);
                break;
            case "ListIdentifiers":
                await this.handleListIdentifiers(oaiRequest);
                break;
            case "ListSets":
                await this.handleListSets();
                break;
            default:
                this.handleIllegalVerb();
        }
    }

    /** Implements the response for the OAI-PMH verb 'Identify'. */
    protected handleIdentify(): void {
        this.xsltParameter["email"] = "repository@geologie.ac.at";
        this.xsltParameter["repositoryName"] = "Tethys RDR";
        this.xsltParameter["repIdentifier"] = REPOSITORY_IDENTIFIER;
        this.xsltParameter["sampleIdentifier"] = "oai:" + REPOSITORY_IDENTIFIER + ":1";
        this.xml.root().ele("Datasets");
    }

    /** Implements the response for the OAI-PMH verb 'ListMetadataFormats'. */
    protected handleListMetadataFormats(): void {
        this.xml.root().ele("Datasets");
    }

    /** Implements the response for the OAI-PMH verb 'ListSets'. */
    protected async handleListSets(): Promise<void> {
        this.xsltParameter["repIdentifier"] = REPOSITORY_IDENTIFIER;
        const datasetElement = this.xml.root().ele("Datasets");

        const sets: IDictionary = {
            open_access: "Set for open access licenses",
            ...(await this.getSetsForDatasetTypes()),
        };

        for (const [setSpec, setName] of Object.entries(sets)) {
            const setElement = datasetElement.ele("Rdr_Sets");
            setElement.att("Type", setSpec);
            setElement.att("TypeName", setName);
        }
    }

    /**
     * Implements the response for the OAI-PMH verb 'GetRecord',
     * e.g. `verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:tethys.at:1`.
     *
     * @param oaiRequest query arguments of the request
     * @throws OaiModelException if an argument is missing or illegal, the dataset is unknown,
     *         or its server state does not allow delivery
     */
    protected async handleGetRecord(oaiRequest: OaiParameter): Promise<void> {
        this.xsltParameter["repIdentifier"] = REPOSITORY_IDENTIFIER;

        if (!("identifier" in oaiRequest)) {
            throw new BadOaiModelException("The prefix of the identifier argument is unknown.");
        }
        const dataId = Number(this.getDocumentIdByIdentifier(oaiRequest.identifier));

        const dataset = await Dataset.findOne({
            where: { publish_id: dataId },
            include: ["xmlCache"],
        });
        if (!dataset || !dataset.publish_id) {
            throw new OaiModelException(
                StatusCodes.INTERNAL_SERVER_ERROR,
                "The value of the identifier argument is unknown or illegal in this repository.",
                OaiErrorCodes.IDDOESNOTEXIST,
            );
        }

        this.xsltParameter["oai_metadataPrefix"] = this.requireMetadataPrefix(oaiRequest);

        // Do not deliver datasets which are restricted by their server state.
        if (dataset.server_state == null || !this.deliveringDocumentStates.includes(dataset.server_state)) {
            throw new OaiModelException(
                StatusCodes.INTERNAL_SERVER_ERROR,
                "Document is not available for OAI export!",
                OaiErrorCodes.NORECORDSMATCH,
            );
        }

        const datasetNode = this.xml.root().ele("Datasets");
        await this.createXmlRecord(dataset, datasetNode);
    }

    /**
     * Implements the response for the OAI-PMH verb 'ListRecords'.
     *
     * @param oaiRequest query arguments of the request
     */
    protected async handleListRecords(oaiRequest: OaiParameter): Promise<void> {
        await this.runWithTokenWorker(() => this.handlingOfLists(oaiRequest, this.configuration.maxListRecs));
    }

    /**
     * Implements the response for the OAI-PMH verb 'ListIdentifiers'.
     *
     * @param oaiRequest query arguments of the request
     */
    protected async handleListIdentifiers(oaiRequest: OaiParameter): Promise<void> {
        await this.runWithTokenWorker(() => this.handlingOfLists(oaiRequest, this.configuration.maxListIds));
    }

    /** Returns the token worker, creating it on first use. */
    private getTokenWorker(): TokenWorker {
        if (!this.tokenWorker) {
            this.tokenWorker = new TokenWorker(TOKEN_WORKER_TTL);
        }
        return this.tokenWorker;
    }

    /**
     * Runs `action` with a connected token worker and closes the worker afterwards,
     * also when `action` throws (OAI errors such as noRecordsMatch are thrown as exceptions).
     */
    private async runWithTokenWorker(action: () => Promise<void>): Promise<void> {
        const worker = this.getTokenWorker();
        if (!worker.Connected) {
            await worker.connect();
        }
        try {
            await action();
        } finally {
            await worker.close();
        }
    }

    /**
     * Returns the `metadataPrefix` argument of the request.
     *
     * @throws OaiModelException (badArgument) if the argument is missing
     */
    private requireMetadataPrefix(oaiRequest: OaiParameter) {
        if (!("metadataPrefix" in oaiRequest)) {
            throw new OaiModelException(
                StatusCodes.INTERNAL_SERVER_ERROR,
                "The prefix of the metadata argument is unknown.",
                OaiErrorCodes.BADARGUMENT,
            );
        }
        return oaiRequest["metadataPrefix"];
    }

    /**
     * Shared implementation of 'ListRecords' and 'ListIdentifiers'.
     *
     * Determines the matching publish ids (from the stored resumption token, or by querying
     * with the set/from/until arguments), adds the first `maxRecords` datasets to the XML
     * document and stores the remaining ids under a new resumption token.
     *
     * @param oaiRequest query arguments of the request
     * @param maxRecords page size; falls back to 100 when falsy
     * @throws OaiModelException for an unknown resumption token, illegal arguments or an empty result
     */
    private async handlingOfLists(oaiRequest: OaiParameter, maxRecords: number): Promise<void> {
        if (!maxRecords) {
            maxRecords = 100;
        }
        this.xsltParameter["repIdentifier"] = REPOSITORY_IDENTIFIER;
        const datasetNode = this.xml.root().ele("Datasets");
        const tokenWorker = this.getTokenWorker();

        let cursor = 0;
        let totalIds = 0;
        let start = maxRecords + 1;
        let reldocIds: (number | null)[] = [];
        let metadataPrefix = null;

        if ("resumptionToken" in oaiRequest) {
            const resParam = oaiRequest["resumptionToken"]; // e.g. "158886496600000"
            const token = await tokenWorker.get(resParam);
            if (!token) {
                throw new OaiModelException(StatusCodes.INTERNAL_SERVER_ERROR, "cache is outdated.", OaiErrorCodes.BADRESUMPTIONTOKEN);
            }
            cursor = token.StartPosition - 1; // zero-based index of the first record of this page
            start = token.StartPosition + maxRecords;
            totalIds = token.TotalIds;
            reldocIds = token.DocumentIds;
            metadataPrefix = token.MetadataPrefix;
            this.xsltParameter["oai_metadataPrefix"] = metadataPrefix;
        } else {
            metadataPrefix = this.requireMetadataPrefix(oaiRequest);
            this.xsltParameter["oai_metadataPrefix"] = metadataPrefix;
            reldocIds = await this.findMatchingPublishIds(oaiRequest);
            totalIds = reldocIds.length;
        }

        // splice() removes the current page from restIds, leaving the ids of later pages.
        const restIds = reldocIds as number[];
        const workIds = restIds.splice(0, maxRecords) as number[];
        if (workIds.length == 0) {
            throw new OaiModelException(
                StatusCodes.INTERNAL_SERVER_ERROR,
                "The combination of the given values results in an empty list.",
                OaiErrorCodes.NORECORDSMATCH,
            );
        }

        const datasets: Dataset[] = await Dataset.findAll({
            where: {
                publish_id: {
                    [Sequelize.Op.in]: workIds,
                },
            },
            include: ["xmlCache"],
            order: ["publish_id"],
        });
        for (const dataset of datasets) {
            await this.createXmlRecord(dataset, datasetNode);
        }

        // Store the remaining ids under a new resumption token.
        if (restIds.length > 0) {
            const token = new ResumptionToken();
            token.StartPosition = start;
            token.TotalIds = totalIds;
            token.DocumentIds = restIds;
            token.MetadataPrefix = metadataPrefix;
            const res = await tokenWorker.set(token);
            this.setParamResumption(res, cursor, totalIds);
        }
    }

    /**
     * Queries the publish ids of all deliverable datasets matching the optional
     * `set`, `from` and `until` arguments, ordered by publish id.
     *
     * @throws OaiModelException for illegal date arguments or date ranges known to be empty
     */
    private async findMatchingPublishIds(oaiRequest: OaiParameter): Promise<(number | null)[]> {
        // `any` on purpose: Sequelize's where/include typings differ between versions and the
        // arrays are passed straight through to findAll.
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const includeArray: any[] = [];
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const andArray: any[] = [
            {
                server_state: {
                    [Sequelize.Op.in]: this.deliveringDocumentStates,
                },
            },
        ];

        if ("set" in oaiRequest) {
            const set = oaiRequest["set"] as string;
            const setArray = set.split(":");
            if (setArray[0] == "data-type") {
                if (setArray.length == 2 && setArray[1]) {
                    andArray.push({
                        type: {
                            [Sequelize.Op.eq]: setArray[1],
                        },
                    });
                }
            } else if (setArray[0] == "open_access") {
                const openAccessLicences = ["CC-BY-4.0", "CC-BY-SA-4.0"];
                includeArray.push({
                    model: License,
                    as: "licenses",
                    required: true, // INNER JOIN: only datasets that have such a license
                    where: {
                        name: {
                            [Sequelize.Op.in]: openAccessLicences,
                        },
                    },
                });
            }
        }

        await this.addDateConditions(oaiRequest, andArray);

        const rows = await Dataset.findAll({
            attributes: ["publish_id"],
            where: andArray,
            order: ["publish_id"],
            include: includeArray,
            raw: true,
        });
        return rows.map((dat) => dat.publish_id);
    }

    /**
     * Appends the `server_date_published` restrictions for the `from` / `until` arguments,
     * e.g. `&from=2020-09-11&until=2021-05-11`.
     */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    private async addDateConditions(oaiRequest: OaiParameter, andArray: any[]): Promise<void> {
        const hasFrom = "from" in oaiRequest;
        const hasUntil = "until" in oaiRequest;

        if (hasFrom && hasUntil) {
            const fromDate = this.parseDateArgument("from", oaiRequest["from"]);
            const untilDate = this.parseDateArgument("until", oaiRequest["until"]);
            if (oaiRequest["from"].length != oaiRequest["until"].length) {
                throw new OaiModelException(
                    StatusCodes.INTERNAL_SERVER_ERROR,
                    "The request has different granularities for the from and until parameters.",
                    OaiErrorCodes.BADARGUMENT,
                );
            }
            andArray.push({
                server_date_published: {
                    [Sequelize.Op.and]: {
                        [Sequelize.Op.gte]: fromDate.format(SQL_DATETIME_FORMAT),
                        [Sequelize.Op.lte]: untilDate.format(SQL_DATETIME_FORMAT),
                    },
                },
            });
        } else if (hasFrom) {
            const fromDate = this.parseDateArgument("from", oaiRequest["from"]);
            if (fromDate.isAfter(dayjs())) {
                throw new OaiModelException(
                    StatusCodes.INTERNAL_SERVER_ERROR,
                    "Given from date is greater than now. The given values results in an empty list.",
                    OaiErrorCodes.NORECORDSMATCH,
                );
            }
            andArray.push({
                server_date_published: {
                    [Sequelize.Op.gte]: fromDate.format(SQL_DATETIME_FORMAT),
                },
            });
        } else if (hasUntil) {
            const untilDate = this.parseDateArgument("until", oaiRequest["until"]);
            // May be null when nothing is published yet; the query then simply returns no rows.
            const firstPublishedDataset: Dataset | null = await Dataset.earliestPublicationDate();
            if (firstPublishedDataset != null && dayjs(firstPublishedDataset.server_date_published).isAfter(untilDate)) {
                throw new OaiModelException(
                    StatusCodes.INTERNAL_SERVER_ERROR,
                    `earliestDatestamp is greater than given until date. The given values results in an empty list.`,
                    OaiErrorCodes.NORECORDSMATCH,
                );
            }
            andArray.push({
                server_date_published: {
                    [Sequelize.Op.lte]: untilDate.format(SQL_DATETIME_FORMAT),
                },
            });
        }
    }

    /**
     * Parses a `from` / `until` argument.
     *
     * A value with day granularity (`YYYY-MM-DD`) is widened to the start (`from`) or the
     * end (`until`) of that day; any other value is used as parsed by dayjs.
     *
     * @param name which argument is parsed; selects the direction of the widening
     * @param value raw query value
     * @throws OaiModelException (badArgument) if the value is not a string or not a valid date
     */
    private parseDateArgument(name: "from" | "until", value: unknown): Dayjs {
        const parsed = typeof value === "string" ? dayjs(value) : null;
        if (typeof value !== "string" || parsed == null || !parsed.isValid()) {
            throw new OaiModelException(
                StatusCodes.INTERNAL_SERVER_ERROR,
                `The value of the ${name} argument is not a valid date.`,
                OaiErrorCodes.BADARGUMENT,
            );
        }
        if (!DAY_GRANULARITY.test(value)) {
            return parsed;
        }
        return name === "from" ? parsed.startOf("day") : parsed.endOf("day");
    }

    /**
     * Sets the stylesheet parameters for the resumptionToken element.
     *
     * @param res value of the resumption token
     * @param cursor value of the cursor
     * @param totalIds total number of matching ids
     */
    private setParamResumption(res: string, cursor: number, totalIds: number): void {
        const tomorrow = dayjs().add(1, "day").format(OAI_DATESTAMP_FORMAT);
        this.xsltParameter["dateDelete"] = tomorrow;
        this.xsltParameter["res"] = res;
        this.xsltParameter["cursor"] = cursor;
        this.xsltParameter["totalIds"] = totalIds;
    }

    /** Appends a `SetSpec` child element carrying `information` in its `Value` attribute. */
    private addSpecInformation(domNode: XMLBuilder, information: string): void {
        domNode.ele("SetSpec").att("Value", information);
    }

    /** Adds the landing page URL of the dataset as `landingpage` attribute. */
    private addLandingPageAttribute(domNode: XMLBuilder, dataid: string): void {
        const baseDomain = process.env.BASE_DOMAIN || "localhost";
        const url = "https://" + this.getDomain(baseDomain) + "/dataset/" + dataid;
        domNode.att("landingpage", url);
    }

    /**
     * Reduces a host name to its domain: strips a leading scheme and leading sub-domain
     * labels (`www.tethys.at` -> `tethys.at`). With exactly two dots the first label is only
     * removed when the second label is longer than three characters, so hosts such as
     * `geologie.ac.at` stay intact.
     *
     * Fixes two porting errors of the former version: it counted commas instead of dots, and
     * used `split(".", 2)[1]`, which in JavaScript yields only the second label rather than
     * everything after the first dot.
     *
     * @param host host name, optionally prefixed with a scheme
     * @returns the lower-cased domain without scheme
     */
    private getDomain(host: string): string {
        const myHost = host
            .trim()
            .toLocaleLowerCase()
            .replace(/^.*:\/\//, "");

        // Removing labels from an IP address would produce a different, invalid address.
        if (IPV4_HOST.test(myHost)) {
            return myHost;
        }

        const labels = myHost.split(".");
        const dotCount = labels.length - 1;
        const afterFirstDot = myHost.substring(myHost.indexOf(".") + 1);
        if (dotCount === 2) {
            return labels[1].length > 3 ? afterFirstDot : myHost;
        }
        if (dotCount > 2) {
            return this.getDomain(afterFirstDot);
        }
        return myHost;
    }

    /**
     * Extracts the numeric dataset id from an OAI identifier of the form
     * `oai:<repository>:<id>`.
     *
     * @param oaiIdentifier value of the `identifier` argument
     * @returns the id part as a string of digits
     * @throws BadOaiModelException if the identifier is not a string or does not start with `oai`
     * @throws OaiModelException (idDoesNotExist) if the id part is missing or not numeric
     */
    private getDocumentIdByIdentifier(oaiIdentifier: string): string {
        // The value comes from the query string and is an array when the argument is repeated.
        if (typeof oaiIdentifier !== "string") {
            throw new BadOaiModelException("The prefix of the identifier argument is unknown.");
        }
        const identifierParts: string[] = oaiIdentifier.split(":");
        if (identifierParts[0] !== "oai") {
            throw new BadOaiModelException("The prefix of the identifier argument is unknown.");
        }
        const dataId: string | undefined = identifierParts[2];
        if (!dataId || !/^\d+$/.test(dataId)) {
            throw new OaiModelException(
                StatusCodes.INTERNAL_SERVER_ERROR,
                "The value of the identifier argument is unknown or illegal in this repository.",
                OaiErrorCodes.IDDOESNOTEXIST,
            );
        }
        return dataId;
    }

    /** Builds the XML node of `dataset`, decorates it and imports it into `datasetNode`. */
    private async createXmlRecord(dataset: Dataset, datasetNode: XMLBuilder): Promise<void> {
        const domNode = await this.getDatasetXmlDomNode(dataset);
        if (dataset.publish_id) {
            this.addLandingPageAttribute(domNode, dataset.publish_id.toString());
        }
        this.addSpecInformation(domNode, "data-type:" + dataset.type);
        datasetNode.import(domNode);
    }

    /** Returns the XML node of `dataset` as produced by XmlModel, with caching enabled. */
    private async getDatasetXmlDomNode(dataset: Dataset) {
        const xmlModel = new XmlModel(dataset);
        xmlModel.excludeEmptyFields();
        if (dataset.xmlCache) {
            xmlModel.setXmlCache = dataset.xmlCache;
        }
        xmlModel.caching = true;
        return await xmlModel.getDomDocument();
    }

    /**
     * Builds one set per project label (`project:<label>`); labels containing characters
     * not allowed in a setSpec are logged and skipped.
     */
    private async getSetsForProjects(): Promise<IDictionary> {
        const sets: IDictionary = {};
        const projects: Project[] = await Project.findAll({
            attributes: ["label"],
            raw: true,
        });
        for (const project of projects) {
            if (!preg_match(this.sampleRegEx, project.label)) {
                const msg = `Invalid SetSpec (project='${project.label}'). Allowed characters are [${this.sampleRegEx}].`;
                Logger.err(`OAI: ${msg}`);
                continue;
            }
            sets["project:" + project.label] = `Set for project '${project.label}'`;
        }
        return sets;
    }

    /**
     * Builds one set per type of the published datasets (`data-type:<type>`). Datasets
     * without a type are skipped; types containing characters not allowed in a setSpec are
     * logged and skipped.
     */
    private async getSetsForDatasetTypes(): Promise<IDictionary> {
        const sets: IDictionary = {};
        const datasets: Dataset[] = await Dataset.findAll({
            attributes: ["type"],
            where: { server_state: { [Sequelize.Op.eq]: "published" } },
        });
        for (const dataset of datasets) {
            if (!dataset.type) {
                // Would otherwise yield the meaningless set "data-type:null".
                continue;
            }
            if (!preg_match(this.sampleRegEx, dataset.type)) {
                const msg = `Invalid SetSpec (data-type='${dataset.type}'). Allowed characters are [${this.sampleRegEx}].`;
                Logger.err(`OAI: ${msg}`);
                continue;
            }
            sets["data-type:" + dataset.type] = `Set for document type '${dataset.type}'`;
        }
        return sets;
    }

    /** Reports an unsupported verb as OAI error `badVerb`. */
    private handleIllegalVerb(): void {
        this.xsltParameter["oai_error_code"] = "badVerb";
        this.xsltParameter["oai_error_message"] = "The verb provided in the request is illegal.";
    }
}