95 lines
2.5 KiB
R
95 lines
2.5 KiB
R
library("readxl")
|
|
library("writexl")
|
|
library("dplyr")
|
|
library("xlsx")
|
|
library("stringi")
|
|
library("SPARQL")
|
|
library("stringr")
|
|
library("tictoc")
|
|
library("rlang")
|
|
|
|
# Load project helpers and configuration.
# NOTE(review): get_current_geoera_lit_db() and the upper-case constants used
# in this script are not defined here - presumably they come from these two
# sourced files; confirm.
for (helper_file in c("sparql.R", "config/config.R")) {
  source(helper_file)
}

### Read input ###

# Current literature database that the Excel entries are matched against.
inDB <- get_current_geoera_lit_db()

# Excel sheet with the entries to replace. The original stays untouched; all
# replacements are written into the working copy `to_replace_done`.
excel_input_path <- paste0(PFAD_EXCEL, FILENAME_EXCEL)
to_replace_original <- read_excel(excel_input_path)
to_replace_done <- to_replace_original
|
|
|
|
###
|
|
|
|
|
|
# Loop over the Excel columns configured in SPALTEN and replace every cell
# that can be matched against a database row with the ID of that row.
#
# Matching rule (per Excel cell and database row):
#   * more than 80 % of the cell's words occur among the row's words, and
#   * more than 99 % of the cell's year-like numbers (19xx / 20xx) occur
#     among the row's year-like numbers; a cell without any such number
#     never matches.
# Cells shorter than 20 characters are blanked in `to_replace_done`; NA cells
# are skipped. If several database rows match, the last matching row wins.

# The database does not change while the Excel cells are processed, so its
# rows are lower-cased and tokenised once here instead of once per cell.
db_rows <- seq_len(nrow(inDB))
db_text <- lapply(db_rows, function(zeile_db) {
  tolower(inDB[zeile_db, INHALTE_DB])
})
db_words <- lapply(db_text, function(text) {
  unlist(strsplit(gsub("[^[:alnum:] ]", "", text), " +"))
})
db_years <- lapply(db_text, function(text) {
  unlist(str_extract_all(text, "(?:19|20)\\d{2}"))
})

# NOTE(review): this iterates the full integer range between the first and
# the last entry of SPALTEN, not only the entries listed in SPALTEN - confirm
# that this is intended.
for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) {

  # Start timing the current column.
  tic(paste("starte mit Spalte", spalte_excel))

  # Extract the column as a plain vector so each cell is a scalar value
  # (and a missing cell is a real NA) rather than a 1x1 table.
  excel_column <- to_replace_original[[spalte_excel]]

  for (zeile_excel in seq_along(excel_column)) {

    current_excel <- tolower(excel_column[zeile_excel])

    if (is.na(current_excel)) {
      next
    }

    # Entries shorter than 20 characters are blanked instead of matched.
    if (nchar(current_excel) < 20) {
      to_replace_done[zeile_excel, spalte_excel] <- ""
      next
    }

    # Words: strip everything except letters, digits and blanks, then split
    # on runs of blanks. Years: every 19xx / 20xx occurrence.
    excel_search_all <- unlist(
      strsplit(gsub("[^[:alnum:] ]", "", current_excel), " +")
    )
    excel_search_numbers <- unlist(
      str_extract_all(current_excel, "(?:19|20)\\d{2}")
    )

    # Without any word the word ratio would be 0 / 0 (NaN) and break the
    # comparison below; without any year the year criterion can never be
    # met. In both cases no database row can match, so skip the scan.
    if (length(excel_search_all) == 0 || length(excel_search_numbers) == 0) {
      next
    }

    for (zeile_db in db_rows) {

      # Share of the cell's words / years that also occur in this row.
      percent_match_all <-
        sum(excel_search_all %in% db_words[[zeile_db]]) /
        length(excel_search_all)
      percent_match_numbers <-
        sum(excel_search_numbers %in% db_years[[zeile_db]]) /
        length(excel_search_numbers)

      if ((percent_match_all > 0.80) && (percent_match_numbers > 0.99)) {
        # Store the row's ID without the angle brackets. No early exit here:
        # a later matching row overwrites an earlier one.
        to_replace_done[zeile_excel, spalte_excel] <-
          str_remove_all(inDB[zeile_db, ID], "[<>]")
      }
    }
  }

  # Stop timing the current column.
  toc()
}
|
|
|
|
# NOTE(review): every per-column timer is already stopped inside the loop and
# no matching tic() for this call is visible in this script - confirm whether
# an overall timer was intended.
toc()

# Write the processed sheet next to a timestamp so earlier results are not
# overwritten: <PFAD_OUT>replaced_<timestamp><FILENAME_EXCEL>.
zeitstempel_output <- format(Sys.time(), "%Y_%m_%d_%H%M%S")
pfad_output <- paste0(PFAD_OUT, "replaced_", zeitstempel_output, FILENAME_EXCEL)
write_xlsx(to_replace_done, pfad_output)
|
|
|