From baef13eddecfe7881422619ca8edcd5438d33572 Mon Sep 17 00:00:00 2001
From: Linsberger Christian
Date: Mon, 6 Sep 2021 16:02:30 +0200
Subject: [PATCH] 2 New Files, Renamed.

---
 00_Setup.R                                    |  8 +++++
 find_new_entries.R => 01_find_new_entries.R   |  2 +-
 02_not_in_db.R                                | 29 +++++++++++++++++++
 ...replace_uri.R => 03_find_and_replace_uri.R |  0
 4 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 00_Setup.R
 rename find_new_entries.R => 01_find_new_entries.R (96%)
 create mode 100644 02_not_in_db.R
 rename find_and_replace_uri.R => 03_find_and_replace_uri.R (100%)

diff --git a/00_Setup.R b/00_Setup.R
new file mode 100644
index 0000000..efaa1cf
--- /dev/null
+++ b/00_Setup.R
@@ -0,0 +1,8 @@
+#### Creates the folder structure for the data ####
+
+source("config/config.R")
+# Test each directory itself before creating it; PFAD_* are presumably set in config/config.R.
+if (!dir.exists("data/")) dir.create("data/")
+if (!dir.exists(PFAD_EXCEL)) dir.create(PFAD_EXCEL)
+if (!dir.exists(PFAD_OUT)) dir.create(PFAD_OUT)
+if (!dir.exists(PFAD_DB_OUT)) dir.create(PFAD_DB_OUT)
\ No newline at end of file
diff --git a/find_new_entries.R b/01_find_new_entries.R
similarity index 96%
rename from find_new_entries.R
rename to 01_find_new_entries.R
index 5e63b2f..ad9e139 100644
--- a/find_new_entries.R
+++ b/01_find_new_entries.R
@@ -83,4 +83,4 @@ distincts <- data.frame(distincts[nchar(distincts$bibliographicCitation) >= THRE
 colnames(distincts) <- (c('bibliographicCitation'))
 
 #Schreibt die Distincts in eine Exceldatei
-write_xlsx(distincts, "data/Output/distincts_automated_gc3d.xlsx")
+write_xlsx(distincts, paste0(PFAD_OUT, "distincts_automated_gc3d.xlsx"))
diff --git a/02_not_in_db.R b/02_not_in_db.R
new file mode 100644
index 0000000..5ac9392
--- /dev/null
+++ b/02_not_in_db.R
@@ -0,0 +1,29 @@
+library("readxl")
+library("writexl")
+library("dplyr")
+library("xlsx")
+library("stringi")
+
+source("config/config.R")
+source("sparql.R")
+
+vorhanden_df <- data.frame(get_current_geoera_lit_db()$citation)
+vorhanden_df_2 <- unique(vorhanden_df)
+
+colnames(vorhanden_df) <- c(COLUMN_NAME)
+
+neu_df <- data.frame(read_excel(paste0(PFAD_EXCEL, "distincts_kontrolliert.xlsx")))
+neu_df <- data.frame(lapply(neu_df, stri_enc_toutf8))
+neu_df_2 <- unique(neu_df)
+
+all_df <- rbind(vorhanden_df, neu_df)
+all_df_2 <- unique(all_df)
+
+# inner_join keeps the rows that occur in both data frames.
+# anti_join keeps the rows of the first data frame that have no match in the second.
+common <- inner_join(neu_df, vorhanden_df)
+not_in_db <- anti_join(neu_df, common)
+
+not_in_db_2 <- anti_join(all_df, vorhanden_df)
+
+write_xlsx(not_in_db, paste0(PFAD_OUT, "not_in_db_", format(Sys.time(), "%Y_%m_%d"), ".xlsx"))
diff --git a/find_and_replace_uri.R b/03_find_and_replace_uri.R
similarity index 100%
rename from find_and_replace_uri.R
rename to 03_find_and_replace_uri.R