library("readxl")
library("writexl")
library("dplyr")
library("xlsx")
library("stringi")
library("SPARQL")
library("stringr")
library("tictoc")
library("rlang")

# Purpose: for every sufficiently long text cell in the configured Excel
# columns, look for a database row that shares more than 80 % of the cell's
# words and all of its year numbers, and replace the cell with that row's ID
# (angle brackets stripped). The result is written to a new, time-stamped
# Excel file.
#
# SPALTEN, INHALTE_DB, ID, PFAD_EXCEL, FILENAME_EXCEL and PFAD_OUT are not
# defined here; presumably they come from config/config.R, and
# get_current_geoera_lit_db() from sparql.R -- TODO confirm.

# Load helper functions and configuration
source("sparql.R")
source("config/config.R")

### Read input ###
inDB <- get_current_geoera_lit_db()
to_replace_original <- read_excel(paste0(PFAD_EXCEL, FILENAME_EXCEL))
to_replace_done <- to_replace_original

### Prepare database texts ###
# The word and year lists of a database row do not depend on the Excel cell
# being processed, so they are computed once here instead of once per cell.
db_texts <- lapply(seq_len(nrow(inDB)), function(zeile_db) {
  tolower(inDB[zeile_db, INHALTE_DB])
})
# Words: drop everything except letters, digits and blanks, split on blanks.
db_words <- lapply(db_texts, function(text) {
  unlist(strsplit(gsub("[^[:alnum:] ]", "", text), " +"))
})
# Years: four-digit numbers starting with 19 or 20.
db_years <- lapply(db_texts, function(text) {
  unlist(str_extract_all(text, "(?:19|20)\\d{2}"))
})

### Matching ###
# Overall timer; it is closed by the final toc() after the column loop.
tic("Gesamtlaufzeit")

# Loop over the columns taken from the Excel column vector.
# NOTE(review): this walks the full range from the first to the last entry of
# SPALTEN, not only the listed entries -- confirm that this is intended.
for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) {
  # Start timing for the current column
  tic(paste("starte mit Spalte", spalte_excel))

  for (zeile_excel in seq_len(nrow(to_replace_original))) {
    current_excel <- tolower(to_replace_original[zeile_excel, spalte_excel])

    # Empty cells stay untouched.
    if (is.na(current_excel)) {
      next
    }
    # Texts shorter than 20 characters are blanked out in the result.
    if (nchar(current_excel) < 20) {
      to_replace_done[zeile_excel, spalte_excel] <- ""
      next
    }

    excel_search_all <- unlist(
      strsplit(gsub("[^[:alnum:] ]", "", current_excel), " +")
    )
    excel_search_numbers <- unlist(
      str_extract_all(current_excel, "(?:19|20)\\d{2}")
    )

    # A cell made up only of punctuation yields no words at all. Without this
    # guard the word share would be 0 / 0; skip the cell and leave it as is.
    if (length(excel_search_all) == 0) {
      next
    }

    for (zeile_db in seq_along(db_words)) {
      # Share of the cell's words that also occur in the database row.
      percent_match_all <-
        sum(excel_search_all %in% db_words[[zeile_db]]) /
        length(excel_search_all)

      # Share of the cell's years that also occur in the database row.
      # Without any year in the cell the value stays at 0.1, so such a cell
      # can never pass the year threshold below.
      percent_match_numbers <- 0.1
      if (length(excel_search_numbers) > 0) {
        percent_match_numbers <-
          sum(excel_search_numbers %in% db_years[[zeile_db]]) /
          length(excel_search_numbers)
      }

      # No early exit: if several database rows qualify, the last one wins.
      if ((percent_match_all > 0.80) && (percent_match_numbers > 0.99)) {
        to_replace_done[zeile_excel, spalte_excel] <-
          str_remove_all(inDB[zeile_db, ID], "[<>]")
      }
    }
  }

  # Stop timing for the current column
  toc()
}

# Stop the overall timer
toc()

### Write output ###
pfad_output <- paste0(
  PFAD_OUT,
  "replaced_",
  format(Sys.time(), "%Y_%m_%d_%H%M%S"),
  FILENAME_EXCEL
)
write_xlsx(to_replace_done, pfad_output)