// tethys.frontend/src/services/dataset.service.ts

import api from "../api/api";
import { Observable } from "rxjs";
import { tap, map } from "rxjs/operators";
import { Dataset, DbDataset, Suggestion } from "@/models/dataset";
import { HitHighlight, OpenSearchResponse, SolrResponse } from "@/models/headers";
import { ActiveFilterCategories } from "@/models/solr";
import { VUE_API } from "@/constants";
import { deserialize } from "class-transformer";

/**
 * Service for searching datasets through the OpenSearch `_search` endpoint and for
 * loading years, sitelinks and single datasets from the backend REST API (`VUE_API`).
 *
 * All request-building helpers are `private static` and are referenced through the class
 * name, so `searchTerm` and `facetedSearch` do not depend on `this`.
 */
class DatasetService {
    /** Number of hits requested per search request. */
    private static readonly PAGE_SIZE = 10;

    /** Fixed DOI prefix that precedes the suffix passed to `getDatasetByDoi`. */
    private static readonly DOI_PREFIX = "10.24341/tethys.";

    /**
     * Fields used by the full-text query together with their relevance boost.
     * Note: the field "subjects" was called "subject" in the former SOLR index.
     */
    private static readonly SEARCH_FIELD_BOOSTS: ReadonlyArray<readonly [string, number]> = [
        ["title", 3],
        ["author", 2],
        ["subjects", 1],
        ["doctype", 1]
    ];

    /**
     * Facet categories that are addressed by their plain field name.
     * Every other category is addressed through its `.keyword` sub-field.
     */
    private static readonly NON_KEYWORD_FACETS: ReadonlySet<string> = new Set(["language", "year", "doctype"]);

    /**
     * Search datasets with the OpenSearch API, combining fuzzy matching and prefix (wildcard)
     * matching on the title, author, subjects and doctype fields, boosted per field.
     *
     * Always requests the first 10 hits, sorted by relevance (`_score`).
     *
     * @param term - Search query term as entered by the user
     * @param openCore - The OpenSearch core (index) to search in
     * @param openHost - The OpenSearch host URL
     * @returns Observable emitting the `_source` document and the `highlight` entry of every hit
     */
    public searchTerm(term: string, openCore: string, openHost: string): Observable<{ datasets: Dataset[], highlights: HitHighlight[] }> {
        // Request body defining the search query logic
        const body = {
            query: DatasetService.buildFullTextQuery(term),
            size: DatasetService.PAGE_SIZE, // Limit to 10 results
            from: 0, // Pagination: start from the first result
            sort: [{ _score: { order: "desc" } }], // Sort by relevance (_score)
            // sort: [{ server_date_published: { order: "desc" } }],
            track_scores: true, // Keeps "_score" available even if the sort criterion is changed to another field
            aggs: DatasetService.buildFacetAggregations(),
            highlight: DatasetService.buildHighlight()
        };

        /**
         * POST the query to OpenSearch. Each entry of `response.hits.hits` carries the matching
         * document in `_source` and the highlighted fragments in `highlight`; the RxJS `map`
         * operator splits these into two parallel arrays.
         */
        return api.post<OpenSearchResponse>(DatasetService.searchUrl(openHost, openCore), body).pipe(
            map(response => ({
                datasets: response.hits.hits.map(hit => hit._source),
                highlights: response.hits.hits.map(hit => hit.highlight)
            }))
        );
    }

    // // For the autocomplete search. Method to perform a search based on a term
    // public searchTermSOLR(term: string, solrCore: string, solrHost: string): Observable<Dataset[]> {
    //     // SOLR endpoint
    //     const host = "https://" + solrHost;
    //     const path = "/solr/" + solrCore + "/select?";
    //     const base = host + path;

    //     //const fields = 'id,server_date_published,abstract_output,title_output,title_additional,author,subject'; // fields we want returned
    //     const fields = [
    //         "id",
    //         "licence",
    //         "server_date_published",
    //         "abstract_output",
    //         "title_output",
    //         "title_additional",
    //         "author",
    //         "subject",
    //         "doctype",
    //     ].toString();

    //     const qfFields = "title^3 author^2 subject^1";

    //     const q_params = {
    //         "0": "fl=" + fields,
    //         q: term + "*",
    //         defType: "edismax",
    //         qf: qfFields,
    //         indent: "on",
    //         wt: "json",
    //     };

    //     // Make API call to Solr and return the result
    //     /**
    //      * When a GET request is made to the Solr server using the api.get<SolrResponse> method, the response received from Solr is an object that includes various details about the search results.
    //      * One of the key properties of this response object is docs, which is an array of documents (datasets) that match the search criteria.
    //      * It is used the pipe method to chain RxJS operators to the Observable returned by api.get. The map operator is used to transform the emitted items of the Observable.
    //      */
    //     const stations = api.get<SolrResponse>(base, q_params).pipe(map((res: SolrResponse) => res.response.docs));

    //     return stations;
    // }


    /**
     * Perform a faceted search with the OpenSearch API using a search term or suggestion plus filters.
     *
     * - If `suggestion` is a string, the same fuzzy + prefix query as in `searchTerm` is used.
     * - If `suggestion` is a `Suggestion` object, a `match` query with operator "and" is run on
     *   the field named by `suggestion.type`, so all terms of `suggestion.value` must be present.
     *
     * Every active filter value is added as a `term` clause that must match. Hits are sorted by
     * `server_date_published` (newest first) and returned 10 at a time.
     *
     * @param suggestion - Free-text search term or a selected suggestion
     * @param activeFilterCategories - Active filters: category name mapped to the selected values
     * @param openCore - The OpenSearch core (index) to search in
     * @param openHost - The OpenSearch host URL
     * @param start - Optional offset of the first hit to return (sent as `from`). Missing,
     *                non-numeric or negative values fall back to 0.
     * @returns Observable emitting the raw OpenSearch response
     */
    public facetedSearch(
        suggestion: Suggestion | string,
        activeFilterCategories: ActiveFilterCategories,
        openCore: string,
        openHost: string,
        start?: string, // Offset of the first hit
    ): Observable<OpenSearchResponse> {
        // The query construction depends on whether the suggestion is a string or a Suggestion object.
        const mainQuery = typeof suggestion === "string"
            ? DatasetService.buildFullTextQuery(suggestion)
            : {
                match: {
                    [suggestion.type]: {
                        query: suggestion.value,
                        operator: "and" // all the terms in the query must be present in the field
                    }
                }
            };

        // Build one term filter per selected value of every active filter category
        const filters = Object.entries(activeFilterCategories).flatMap(([category, values]) => {
            const field = DatasetService.NON_KEYWORD_FACETS.has(category) ? category : `${category}.keyword`;
            return values.map(value => ({ term: { [field]: value } }));
        });

        // Request body for the faceted search
        const body = {
            query: {
                bool: {
                    must: [
                        mainQuery, // Ensure the main query must be satisfied
                        ...filters // Ensure all filters must be satisfied
                    ]
                }
            },
            size: DatasetService.PAGE_SIZE,
            from: DatasetService.parseOffset(start),
            sort: [{ server_date_published: { order: "desc" } }], // Sort by publication date
            // sort: [{ _score: { order: "desc" } }], // Sort by _score in descending order
            track_scores: true, // Compute "_score" although the hits are sorted by another field
            aggs: DatasetService.buildFacetAggregations(),
            highlight: DatasetService.buildHighlight()
        };

        // API call; the observable of the raw search response is handed to the caller
        return api.post<OpenSearchResponse>(DatasetService.searchUrl(openHost, openCore), body);
    }

    // /**
    //  * This method performs a faceted search on a Solr core. Faceted search allows the user to filter search results based on various categories (facets)
    //  */
    // public facetedSearchSOLR(
    //     suggestion: Suggestion | string,
    //     activeFilterCategories: ActiveFilterCategories,
    //     solrCore: string,
    //     solrHost: string,
    //     start?: string, // Starting page
    // ): Observable<SolrResponse> {
    //     // console.log("face:", suggestion);
    //     // console.log(activeFilterCategories);
    //     // console.log(solrCore);
    //     // console.log(solrHost);
    //     // console.log(start);

    //     console.log("facetedsearchSOLR > suggestion entered:");
    //     console.log(suggestion);

    //     // Construct Solr query parameters
    //     const host = "https://" + solrHost;
    //     const path = "/solr/" + solrCore + "/select?";
    //     const base = host + path;

    //     const fields = [
    //         "id",
    //         "licence",
    //         "server_date_published",
    //         "abstract_output",
    //         "identifier",
    //         "title_output",
    //         "title_additional",
    //         "author",
    //         "subject",
    //         "doctype",
    //     ].toString();

    //     // Determine search term, query operator, and query fields based on the suggestion type. Depending on whether suggestion is a string or a Suggestion object, it constructs the search term and query fields differently.
    //     let term, queryOperator, qfFields;
    //     if (typeof suggestion === "string") { // If suggestion is a string, it appends a wildcard (*) for partial matches.
    //         term = suggestion + "*";
    //         queryOperator = "or";
    //         qfFields = "title^3 author^2 subject^1";
    //     } else if (suggestion instanceof Suggestion) { // If suggestion is a Suggestion object, it forms a more specific query based on the type and value of the suggestion.
    //         term = suggestion.type + ':"' + suggestion.value + '"';
    //         queryOperator = "and";
    //         qfFields = undefined;
    //     }

    //     // Set default value for start if not provided
    //     if (start === undefined) start = "0";

    //     // Construct filter fields based on active filter categories
    //     const filterFields = new Array<string>();
    //     if (Object.keys(activeFilterCategories).length > 0) {
    //         /* Declare variable prop with a type that is a key of the activeFilterCategories. The 'keyof typeof' activeFilterCategories type represents all possible keys
    //         that can exist on the activeFilterCategories --> prop can only be assigned a value that is a key of the activeFilterCategories object */
    //         let prop: keyof typeof activeFilterCategories;
    //         for (prop in activeFilterCategories) {
    //             const filterItems = activeFilterCategories[prop];
    //             filterItems.forEach(function (value: string) {
    //                 filterFields.push(prop + ':("' + value + '")');
    //                 // e.g. Array [ 'subject:("Vektordaten")', 'author:("GeoSphere Austria, ")' ]
    //             });
    //         }
    //     }

    //     // https://solr.apache.org/guide/8_4/json-request-api.html
    //     // Construct Solr query parameters
    //     const q_params = {
    //         "0": "fl=" + fields,
    //         q: term,
    //         "q.op": queryOperator,
    //         defType: "edismax",
    //         qf: qfFields,
    //         // df: "title",
    //         indent: "on",
    //         wt: "json",
    //         rows: 10,
    //         // fq: ["subject:Steiermark", "language:de"],
    //         fq: filterFields,
    //         start: start,
    //         sort: "server_date_published desc",
    //         facet: "on",
    //         // "facet.field": "language",
    //         "json.facet.language": '{ type: "terms", field: "language" }',
    //         "json.facet.subject": '{ type: "terms", field: "subject", limit: -1 }',
    //         "json.facet.year": '{ type: "terms", field: "year" }',
    //         "json.facet.author": '{ type: "terms", field: "author_facet", limit: -1 }',
    //     };
    //     /* E.g.
    //     {"0":"fl=id,licence,server_date_published,abstract_output,identifier,title_output,title_additional,author,subject,doctype","q":"*","q.op":"or","defType":"edismax",
    //     "qf":"title^3 author^2 subject^1",
    //     "indent":"on","wt":"json","rows":10,
    //     "fq":["subject:(\"Vektordaten\")","author:(\"GeoSphere Austria, \")"],
    //     "start":"0","sort":"server_date_published desc","facet":"on",
    //     "json.facet.language":"{ type: \"terms\", field: \"language\" }",
    //     "json.facet.subject":"{ type: \"terms\", field: \"subject\", limit: -1 }",
    //     "json.facet.year":"{ type: \"terms\", field: \"year\" }",
    //     "json.facet.author":"{ type: \"terms\", field: \"author_facet\", limit: -1 }"}
    //      */
    //     // console.log(JSON.stringify(q_params));

    //     // Make API call to Solr and return the result
    //     const stations = api.get<SolrResponse>(base, q_params);

    //     return stations;
    // }

    /**
     * Fetch the list of years from the backend (`GET {VUE_API}/api/years`).
     *
     * @returns Observable emitting the years as strings
     */
    public getYears(): Observable<string[]> {
        return api.get<string[]>(VUE_API + "/api/years");
    }

    /**
     * Fetch the documents of a specific year (`GET {VUE_API}/api/sitelinks/{year}`).
     *
     * @param year - Year whose documents are requested; appended to the URL as given
     * @returns Observable emitting the datasets of that year
     */
    public getDocuments(year: string): Observable<Array<DbDataset>> {
        return api.get<Array<DbDataset>>(VUE_API + "/api/sitelinks/" + year);
    }

    /**
     * Fetch a dataset by its ID (`GET {VUE_API}/api/dataset/{id}`).
     *
     * @param id - Numeric dataset ID
     * @returns Observable emitting the dataset after it has passed through `prepareDataset`
     */
    public getDataset(id: number): Observable<DbDataset> {
        const apiUrl = VUE_API + "/api/dataset/" + id;
        return api.get<DbDataset>(apiUrl).pipe(map((res) => this.prepareDataset(res)));
    }

    /**
     * Fetch a dataset by its DOI (`GET {VUE_API}/api/dataset/10.24341/tethys.{doi}`).
     *
     * @param doi - The part of the DOI that follows the fixed prefix "10.24341/tethys."
     * @returns Observable emitting the dataset after it has passed through `prepareDataset`
     */
    public getDatasetByDoi(doi: string): Observable<DbDataset> {
        const apiUrl = VUE_API + "/api/dataset/" + DatasetService.DOI_PREFIX + doi;
        return api.get<DbDataset>(apiUrl).pipe(map((res) => this.prepareDataset(res)));
    }

    /**
     * Turn the plain response object into a `DbDataset` instance (via class-transformer's
     * `deserialize` on its JSON representation) and set its `url` to the URI of the current document.
     *
     * @param datasetObj - Dataset object as received from the API
     * @returns The deserialized dataset with `url` set
     */
    private prepareDataset(datasetObj: DbDataset): DbDataset {
        const dataset = deserialize<DbDataset>(DbDataset, JSON.stringify(datasetObj));
        dataset.url = document.documentURI;
        return dataset;
    }

    /** Build the URL of the `_search` endpoint for the given host and core. */
    private static searchUrl(openHost: string, openCore: string): string {
        return openHost + "/" + openCore + "/_search";
    }

    /**
     * Build the boosted full-text query shared by `searchTerm` and `facetedSearch`.
     *
     * Per the original implementation notes: `match` queries are case-insensitive by default,
     * while `wildcard` queries are case-sensitive by default, so the wildcard pattern is built
     * from the lowercased term. At least one of the clauses has to match.
     */
    private static buildFullTextQuery(term: string): Record<string, unknown> {
        const prefixPattern = `${term.toLowerCase()}*`;

        const fuzzyClauses = DatasetService.SEARCH_FIELD_BOOSTS.map(([field, boost]) => ({
            match: { [field]: { query: term, fuzziness: "AUTO", boost } }
        }));
        const prefixClauses = DatasetService.SEARCH_FIELD_BOOSTS.map(([field, boost]) => ({
            wildcard: { [field]: { value: prefixPattern, boost } }
        }));

        return {
            bool: {
                should: [...fuzzyClauses, ...prefixClauses],
                minimum_should_match: 1 // Require at least one match
            }
        };
    }

    /**
     * Build the `terms` aggregations that provide the facet buckets.
     *
     * Per the original implementation notes: "language", "year" and "doctype" must be addressed
     * without the ".keyword" suffix, otherwise no buckets are obtained for them.
     * For a very large number of distinct terms a huge `size` is not efficient; a composite
     * aggregation would allow paging through the terms instead.
     */
    private static buildFacetAggregations(): Record<string, unknown> {
        return {
            subjects: { terms: { field: "subjects.keyword", size: 1000 } }, // The SOLR field was called "subject"
            language: { terms: { field: "language" } },
            author: { terms: { field: "author.keyword", size: 1000 } },
            year: { terms: { field: "year", size: 100 } },
            doctype: { terms: { field: "doctype", size: 50 } }
        };
    }

    /** Build the highlight section: matching terms are highlighted in all searched fields. */
    private static buildHighlight(): Record<string, unknown> {
        return {
            fields: {
                title: {},
                author: {},
                subjects: {},
                doctype: {}
            }
        };
    }

    /**
     * Convert the optional `start` string into the numeric `from` offset.
     *
     * Parsed as a base-10 integer. Missing, non-numeric (NaN would be serialized as `null`)
     * and negative values yield 0.
     */
    private static parseOffset(start?: string): number {
        if (!start) {
            return 0;
        }
        const offset = Number.parseInt(start, 10);
        return Number.isFinite(offset) && offset > 0 ? offset : 0;
    }
}

// A single DatasetService instance is created here and exported as the module's default export.
export default new DatasetService();