# Replaces free-text citations in selected columns of an Excel sheet with the
# URI of the best-fitting entry of a literature database, then writes the
# result to a new Excel file prefixed with "replaced_".

library("readxl")
library("writexl")
library("dplyr")
library("xlsx")
library("stringi")
library("SPARQL")
library("stringr")
library("tictoc")
# NOTE(review): sparql.R is sourced here and again after setwd() below;
# confirm whether both calls are really needed.
source("../sparql.R")

###### Configuration #########
# Working directory
# NOTE(review): hard-coded, machine-specific path.
setwd("C:/Users/linchr/ownCloud/GIT/URI_Replacement")
# Load helper functions
source("sparql.R")
# Excel file in which the replacement takes place
PFAD_EXCEL <- "GC3D_Limits_ref.xlsx"
# Column numbers of the Excel sheet in which cells are replaced
SPALTEN <- c(2, 3, 4, 5, 6)
# Database column number the cells are compared against (citations)
INHALTE_DB <- 3
# Database column number whose content is inserted on a match (URIs)
ID <- 1
# Cells shorter than this many characters are blanked instead of matched
MIN_LAENGE <- 20
# Share of the cell's words that must occur in the database entry
SCHWELLE_WOERTER <- 0.80
# Share of the cell's numbers that must occur in the database entry
SCHWELLE_ZAHLEN <- 0.99
##############################

# Split a string into its words after dropping every character that is
# neither alphanumeric nor a space.
extract_words <- function(text) {
  unlist(strsplit(gsub("[^[:alnum:] ]", "", text), " +"))
}

# Return all digit runs (optionally wrapped in parentheses) of a single
# string as a plain character vector. str_extract_all() returns a list with
# one element per input string, hence the [[1]].
extract_numbers <- function(text) {
  str_extract_all(text, "\\(?[0-9]+\\)?")[[1]]
}

### Read input ###
inDB <- get_current_geoera_lit_db()
to_replace_original <- read_excel(PFAD_EXCEL)
to_replace_done <- to_replace_original
###

# Tokenise the database once instead of once per Excel cell. `[[` works for
# both tibbles and plain data frames.
db_text <- tolower(as.character(inDB[[INHALTE_DB]]))
db_uris <- str_remove_all(as.character(inDB[[ID]]), "[<>]")
db_words <- lapply(db_text, extract_words)
db_numbers <- lapply(db_text, extract_numbers)

# Overall timer; it is closed by the final toc() after the loop.
tic("Gesamtlaufzeit")

# Loop over exactly the configured columns (not the range first:last).
for (spalte_excel in SPALTEN) {
  # Start timing for the current column
  tic(paste("starte mit Spalte", spalte_excel))

  # Work on a character copy so that "" and URIs can always be stored,
  # whatever type read_excel() guessed for the column.
  spalten_werte <- as.character(to_replace_original[[spalte_excel]])
  ergebnis <- spalten_werte

  for (zeile_excel in seq_along(spalten_werte)) {
    current_excel <- tolower(spalten_werte[zeile_excel])

    # Empty cells stay untouched.
    if (is.na(current_excel)) {
      next
    }
    # Cells too short to be matched are blanked.
    if (nchar(current_excel) < MIN_LAENGE) {
      ergebnis[zeile_excel] <- ""
      next
    }

    excel_words <- extract_words(current_excel)
    excel_numbers <- extract_numbers(current_excel)

    # Nothing to compare (e.g. a cell made of punctuation only).
    if (length(excel_words) == 0) {
      next
    }

    # Share of the cell's words found in each database entry.
    anteil_woerter <- vapply(
      db_words,
      function(w) mean(excel_words %in% w),
      numeric(1)
    )

    # Share of the cell's numbers found in each database entry. A cell
    # without any number satisfies the criterion trivially (avoids 0 / 0).
    if (length(excel_numbers) == 0) {
      anteil_zahlen <- rep(1, length(db_text))
    } else {
      anteil_zahlen <- vapply(
        db_numbers,
        function(n) mean(excel_numbers %in% n),
        numeric(1)
      )
    }

    treffer <- which(
      anteil_woerter > SCHWELLE_WOERTER & anteil_zahlen > SCHWELLE_ZAHLEN
    )

    # As in the original row-by-row loop, the last matching entry wins.
    if (length(treffer) > 0) {
      ergebnis[zeile_excel] <- db_uris[treffer[length(treffer)]]
    }
  }

  to_replace_done[[spalte_excel]] <- ergebnis
  toc()
}
toc()

pfad_output <- paste0("replaced_", PFAD_EXCEL)
write_xlsx(to_replace_done, pfad_output)