diff --git a/find_and_replace_uri.R b/find_and_replace_uri.R index 532e6b3..8239d16 100644 --- a/find_and_replace_uri.R +++ b/find_and_replace_uri.R @@ -18,83 +18,84 @@ setwd("C:/Users/linchr/ownCloud/GIT/URI_Replacement") source("sparql.R") #Datei in der ersetzt werden soll -PFAD_EXCEL = "GC3D_Limits_ref.xlsx" +FILENAME_EXCEL <- "GC3D_Limits_ref.xlsx" +PFAD_EXCEL <- "data/" #Spaltennummer in denen ersetzt werden soll im Excel -SPALTEN = c(2,3,4,5,6) +SPALTEN <- c(2, 3, 4, 5, 6) #Spaltennummer mit der verglichen wird aus Datenbank (citations) -INHALTE_DB = 3 +INHALTE_DB <- 3 #Spaltennummer der Inhalte die eingesetzt werden soll (uris) -ID = 1 +ID <- 1 ############################## ### Einlesen ### -inDB = get_current_geoera_lit_db() +inDB <- get_current_geoera_lit_db() -to_replace_original = read_excel(PFAD_EXCEL) -to_replace_done = to_replace_original +to_replace_original <- read_excel(paste(PFAD_EXCEL, FILENAME_EXCEL, sep="")) +to_replace_done <- to_replace_original ### #Schleife über den Spaltenvektor aus der Excel -for(spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]){ +for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) { #Start der Zeitmessung für aktuelle Spalte tic(paste("starte mit Spalte",spalte_excel)) - for (zeile_excel in 1:nrow(to_replace_original[,spalte_excel])) { + for (zeile_excel in 1:nrow(to_replace_original[, spalte_excel])) { - current_excel = tolower(to_replace_original[zeile_excel,spalte_excel]) + current_excel <- tolower(to_replace_original[zeile_excel, spalte_excel]) - if(is.na(current_excel)){ + if (is.na(current_excel)) { next } - if((nchar(current_excel) < 20)){ - to_replace_done[zeile_excel, spalte_excel] = "" + if ((nchar(current_excel) < 20)) { + to_replace_done[zeile_excel, spalte_excel] <- "" next } - excel_search_all = unlist(strsplit(gsub("[^[:alnum:] ]", "", current_excel), " +")) - excel_search_numbers = str_extract_all(current_excel,"\\(?[0-9]+\\)?") + excel_search_all <- unlist(strsplit(gsub("[^[:alnum:] ]", "", current_excel), " +")) + excel_search_numbers <- str_extract_all(current_excel, "\\(?[0-9]+\\)?") - for (zeile_db in 1:nrow(inDB[,INHALTE_DB])) { + for (zeile_db in 1:nrow(inDB[, INHALTE_DB])) { - current_db = tolower(inDB[zeile_db,INHALTE_DB]) + current_db <- tolower(inDB[zeile_db,INHALTE_DB]) - db_search_all = unlist(strsplit(gsub("[^[:alnum:] ]", "", current_db), " +")) - db_search_numbers = str_extract_all(current_db,"\\(?[0-9]+\\)?") + db_search_all <- unlist(strsplit(gsub("[^[:alnum:] ]", "", current_db), " +")) + db_search_numbers <- str_extract_all(current_db, "\\(?[0-9]+\\)?") - count_all = 0 - count_numbers = 0 + count_all <- 0 + count_numbers <- 0 - for(k in 1:length(excel_search_all)){ - if(excel_search_all[k] %in% db_search_all){ + for (k in 1:length(excel_search_all)) { + if (excel_search_all[k] %in% db_search_all) { - count_all = count_all + 1 + count_all <- count_all + 1 } } - for(l in 1:length(excel_search_numbers)){ - if(excel_search_numbers[l] %in% db_search_numbers){ + for (l in 1:length(excel_search_numbers)) { + if (excel_search_numbers[l] %in% db_search_numbers) { - count_numbers =+ count_numbers + 1 + count_numbers <- count_numbers + 1 } } - percent_match_all = count_all/ length(excel_search_all) - percent_match_numbers = count_numbers/ length(excel_search_numbers) + percent_match_all <- count_all / length(excel_search_all) + percent_match_numbers <- count_numbers / length(excel_search_numbers) - if((percent_match_all > 0.80) && (percent_match_numbers > 0.99)){ + if ((percent_match_all > 0.80) && (percent_match_numbers > 0.99)) { - to_replace_done[zeile_excel, spalte_excel] = str_remove_all(inDB[zeile_db,ID],"[<>]") + to_replace_done[zeile_excel, spalte_excel] <- str_remove_all(inDB[zeile_db,ID], "[<>]") } @@ -105,7 +106,7 @@ for(spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]){ toc() -pfad_output = paste("replaced_",PFAD_EXCEL, sep="") +pfad_output <- paste(PFAD_EXCEL, "replaced_" , FILENAME_EXCEL, sep="") write_xlsx(to_replace_done, pfad_output) diff --git a/sparql.R b/sparql.R index 93c2c13..8834133 100644 --- a/sparql.R +++ b/sparql.R @@ -5,10 +5,10 @@ library("stringi") read_current_geoera_lit_db <- function() { - ## http verwenden - https funktioniert nicht ## - endpoint = "http://resource.geolba.ac.at/PoolParty/sparql/geoera" +## http verwenden - https funktioniert nicht ## +endpoint = "http://resource.geolba.ac.at/PoolParty/sparql/geoera" - query = +query <- "PREFIX skos: PREFIX dcterms: select * @@ -21,18 +21,18 @@ order by ?L " -qd <- SPARQL(endpoint,query) +qd <- SPARQL(endpoint, query) inDB <- qd$results -for(j in 1:length(inDB[1,])){ +for (j in 1:length (inDB[1, ])) { - for(i in 1:length(inDB[,j])){ - Encoding(inDB[i,j]) = "UTF-8" + for (i in 1:length(inDB[, j])) { + Encoding(inDB[i, j]) <- "UTF-8" } } -pfad = paste("inDB-",format(Sys.Date(), "%Y_%m_%d"),".xlsx", sep="") +pfad <- paste("data/inDB-", format(Sys.Date(), "%Y_%m_%d"), ".xlsx", sep="") write_xlsx(inDB, pfad) @@ -41,12 +41,13 @@ write_xlsx(inDB, pfad) get_current_geoera_lit_db <- function() { - pfad = paste("inDB-",format(Sys.Date(), "%Y_%m_%d"),".xlsx", sep="") + pfad <- paste("data/inDB-", format(Sys.Date(), "%Y_%m_%d"), ".xlsx", sep="") - if(file.exists(pfad)){ - inDB = read_excel(pfad) + if (file.exists(pfad)) { + inDB <- read_excel(pfad) return(inDB) } + else { read_current_geoera_lit_db() @@ -55,4 +56,3 @@ get_current_geoera_lit_db <- function() { } -#a change \ No newline at end of file