diff --git a/URI_Replacement.Rproj b/URI_Replacement.Rproj
new file mode 100644
index 0000000..8e3c2eb
--- /dev/null
+++ b/URI_Replacement.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/data/GC3D_Limits_ref.xlsx b/data/GC3D_Limits_ref.xlsx
new file mode 100644
index 0000000..9a7e5b6
Binary files /dev/null and b/data/GC3D_Limits_ref.xlsx differ
diff --git a/data/GC3D_Units_ref.xlsx b/data/GC3D_Units_ref.xlsx
new file mode 100644
index 0000000..d0cff0f
Binary files /dev/null and b/data/GC3D_Units_ref.xlsx differ
diff --git a/data/inDB-2021_09_01.xlsx b/data/inDB-2021_09_01.xlsx
new file mode 100644
index 0000000..818f8c8
Binary files /dev/null and b/data/inDB-2021_09_01.xlsx differ
diff --git a/find_and_replace_uri.R b/find_and_replace_uri.R
new file mode 100644
index 0000000..532e6b3
--- /dev/null
+++ b/find_and_replace_uri.R
@@ -0,0 +1,142 @@
+library("readxl")
+library("writexl")
+library("dplyr")
+library("xlsx")
+library("stringi")
+library("SPARQL")
+library("stringr")
+library("tictoc")
+
+# Replaces free-text literature citations in selected columns of an Excel
+# sheet with the URI of a matching entry of the literature database.
+# A cell is replaced when more than MIN_WORD_MATCH of its words and more than
+# MIN_NUMBER_MATCH of its numbers occur in a database citation.
+
+###### Configuration #########
+
+# Working directory; only entered if it exists, so the script also runs from
+# the project root on other machines
+ARBEITSVERZEICHNIS = "C:/Users/linchr/ownCloud/GIT/URI_Replacement"
+if (dir.exists(ARBEITSVERZEICHNIS)) {
+  setwd(ARBEITSVERZEICHNIS)
+}
+
+# Load helper functions (get_current_geoera_lit_db); sourced once, after the
+# working directory is set
+source("sparql.R")
+
+# Excel file in which citations are replaced (kept under data/ in this repo)
+PFAD_EXCEL = file.path("data", "GC3D_Limits_ref.xlsx")
+
+# Column numbers of the Excel sheet in which citations are replaced
+SPALTEN = c(2,3,4,5,6)
+
+# Column number of the database column that is compared against (citations)
+INHALTE_DB = 3
+
+# Column number of the database column whose content is inserted (uris)
+ID = 1
+
+# Cells shorter than this many characters are blanked instead of matched
+MIN_CHARS = 20
+
+# A match needs more than these shares of the cell's words / numbers
+MIN_WORD_MATCH = 0.80
+MIN_NUMBER_MATCH = 0.99
+
+# Numbers, optionally wrapped in parentheses, e.g. "2019" or "(3)"
+NUMBER_PATTERN = "\\(?[0-9]+\\)?"
+
+##############################
+
+
+### Helpers ###
+
+# Splits text into words after removing everything except letters, digits
+# and blanks.
+split_words = function(text) {
+  unlist(strsplit(gsub("[^[:alnum:] ]", "", text), " +"))
+}
+
+
+### Read input ###
+inDB = get_current_geoera_lit_db()
+
+to_replace_original = read_excel(PFAD_EXCEL)
+to_replace_done = to_replace_original
+
+# Tokenise the database once instead of once per Excel cell. `[[` returns a
+# plain vector for data frames and tibbles alike.
+db_citations = tolower(as.character(inDB[[INHALTE_DB]]))
+db_uris = str_remove_all(as.character(inDB[[ID]]), "[<>]")
+db_words = lapply(db_citations, split_words)
+db_numbers = str_extract_all(db_citations, NUMBER_PATTERN)
+
+###
+
+
+# Total run time, reported by the final toc()
+tic("Gesamtlaufzeit")
+
+# Loop over the configured Excel columns (also works for non-contiguous ones)
+for (spalte_excel in SPALTEN) {
+
+  # Start timing the current column
+  tic(paste("starte mit Spalte",spalte_excel))
+
+  # Work on a character copy so that "" and URIs can be stored in any column
+  cells = as.character(to_replace_original[[spalte_excel]])
+  result = cells
+
+  for (zeile_excel in seq_along(cells)) {
+
+    current_excel = tolower(cells[zeile_excel])
+
+    if (is.na(current_excel)) {
+      next
+    }
+
+    if (nchar(current_excel) < MIN_CHARS) {
+      result[zeile_excel] = ""
+      next
+    }
+
+    excel_words = split_words(current_excel)
+    # str_extract_all() returns a list with one entry per input string
+    excel_numbers = str_extract_all(current_excel, NUMBER_PATTERN)[[1]]
+
+    # Nothing left to compare (cell consists of punctuation only)
+    if (length(excel_words) == 0) {
+      next
+    }
+
+    for (zeile_db in seq_along(db_citations)) {
+
+      share_words = mean(excel_words %in% db_words[[zeile_db]])
+
+      if (length(excel_numbers) > 0) {
+        share_numbers = mean(excel_numbers %in% db_numbers[[zeile_db]])
+      } else {
+        # A cell without numbers only matches a citation without numbers
+        share_numbers = as.numeric(length(db_numbers[[zeile_db]]) == 0)
+      }
+
+      # No break: if several database entries match, the last one wins
+      if ((share_words > MIN_WORD_MATCH) && (share_numbers > MIN_NUMBER_MATCH)) {
+        result[zeile_excel] = db_uris[zeile_db]
+      }
+
+    }
+  }
+
+  to_replace_done[[spalte_excel]] = result
+  toc()
+}
+
+toc()
+
+# basename() keeps the prefix on the file name, not on the directory part
+pfad_output = paste("replaced_", basename(PFAD_EXCEL), sep="")
+
+write_xlsx(to_replace_done, pfad_output)
+
diff --git a/sparql.R b/sparql.R
index 21fb851..93c2c13 100644
--- a/sparql.R
+++ b/sparql.R
@@ -53,4 +53,6 @@ get_current_geoera_lit_db <- function() {
 
     get_current_geoera_lit_db()
   }
-}
\ No newline at end of file
+}
+
+#a change
\ No newline at end of file