URI_Replacement/find_and_replace_uri.R
2021-09-02 11:08:10 +02:00

112 lines
2.7 KiB
R

library("readxl")
library("writexl")
library("dplyr")
library("xlsx")
library("stringi")
library("SPARQL")
library("stringr")
library("tictoc")
source("../sparql.R")
###### Configuration #########
# Working directory. The relative paths used below ("sparql.R", "data/") are
# resolved against it.
# NOTE(review): hard-coded, machine-specific absolute path - adjust before
# running the script on another machine.
setwd("C:/Users/linchr/ownCloud/GIT/URI_Replacement")
# Load helper functions.
# NOTE(review): presumably this is where get_current_geoera_lit_db() comes
# from; "../sparql.R" is also sourced before the working directory is set -
# confirm whether both calls are needed.
source("sparql.R")
# File name of the Excel workbook in which values are to be replaced
FILENAME_EXCEL <- "GC3D_Limits_ref.xlsx"
# Directory (relative path) that contains the workbook
PFAD_EXCEL <- "data/"
# Column numbers of the Excel sheet in which replacements are made
SPALTEN <- c(2, 3, 4, 5, 6)
# Column number of the database table that is compared against (citations)
INHALTE_DB <- 3
# Column number of the database table whose content is inserted (URIs)
ID <- 1
##############################
### Read input ###
# Reference table from the literature database. Column INHALTE_DB is compared
# against the Excel cells, column ID supplies the replacement value.
inDB <- get_current_geoera_lit_db()
to_replace_original <- read_excel(paste0(PFAD_EXCEL, FILENAME_EXCEL))
# Working copy that receives the replacements. Matching always reads from the
# untouched original so earlier replacements cannot influence later ones.
to_replace_done <- to_replace_original

### Matching thresholds and helpers ###
# Minimum share of a cell's words that must occur in a database entry.
MIN_WORD_MATCH <- 0.80
# Minimum share of a cell's numbers that must occur in a database entry.
MIN_NUMBER_MATCH <- 0.99
# Cells with fewer characters than this are blanked instead of being matched.
MIN_CELL_LENGTH <- 20

# Split one text into words: everything except letters, digits and spaces is
# removed, then the text is split on runs of spaces.
split_words <- function(text) {
  unlist(strsplit(gsub("[^[:alnum:] ]", "", text), " +"))
}

# Extract all digit runs (with optional enclosing parentheses) of one text.
# str_extract_all() returns a list with one element per input string; `[[1]]`
# unwraps it so that `%in%` later compares individual numbers instead of one
# deparsed vector.
extract_numbers <- function(text) {
  str_extract_all(text, "\\(?[0-9]+\\)?")[[1]]
}

### Prepare the database side once ###
# These values do not depend on the Excel cell, so they are computed a single
# time instead of once per cell and database row.
db_citations <- tolower(as.character(inDB[[INHALTE_DB]]))
db_words <- lapply(db_citations, split_words)
db_numbers <- lapply(db_citations, extract_numbers)
db_uris <- str_remove_all(as.character(inDB[[ID]]), "[<>]")

### Replace ###
# Total run time; closed by the final toc() below.
tic("Gesamtlaufzeit")
# Loop over exactly the configured columns (not the range first:last, which
# would also visit columns that are not listed in SPALTEN).
for (spalte_excel in SPALTEN) {
  # Start timing for the current column
  tic(paste("starte mit Spalte", spalte_excel))
  column_text <- tolower(as.character(to_replace_original[[spalte_excel]]))
  for (zeile_excel in seq_along(column_text)) {
    current_excel <- column_text[zeile_excel]
    # Empty cells stay as they are
    if (is.na(current_excel)) {
      next
    }
    # Very short cells are blanked without attempting a match
    if (nchar(current_excel) < MIN_CELL_LENGTH) {
      to_replace_done[zeile_excel, spalte_excel] <- ""
      next
    }
    excel_words <- split_words(current_excel)
    # Nothing to compare (e.g. a cell made of punctuation only): leave the
    # cell unchanged instead of failing on an empty word list
    if (length(excel_words) == 0L) {
      next
    }
    excel_numbers <- extract_numbers(current_excel)

    # One TRUE/FALSE per database row: does the row match this cell?
    is_match <- vapply(seq_along(db_words), function(zeile_db) {
      word_share <- mean(excel_words %in% db_words[[zeile_db]])
      numbers_ok <- if (length(excel_numbers) == 0L) {
        # A cell without numbers only matches entries without numbers
        length(db_numbers[[zeile_db]]) == 0L
      } else {
        mean(excel_numbers %in% db_numbers[[zeile_db]]) > MIN_NUMBER_MATCH
      }
      word_share > MIN_WORD_MATCH && numbers_ok
    }, logical(1))

    # If several rows match, the last one wins (each later match used to
    # overwrite the previous one)
    hits <- which(is_match)
    if (length(hits) > 0L) {
      to_replace_done[zeile_excel, spalte_excel] <- db_uris[hits[length(hits)]]
    }
  }
  # Report the elapsed time for the current column
  toc()
}
# Report the total elapsed time
toc()

### Write output ###
pfad_output <- paste0(PFAD_EXCEL, "replaced_", FILENAME_EXCEL)
write_xlsx(to_replace_done, pfad_output)