Comments & more Config

2021-09-06 15:27:49 +02:00 · 2021-09-06 15:27:49 +02:00 · 8a1fa79065
commit 8a1fa79065
parent 6a8966a263
3 changed files with 46 additions and 22 deletions
--- a/config/config.R
+++ b/config/config.R
@ -6,9 +6,18 @@ PFAD_EXCEL <- "data/Input/"
 PFAD_OUT <- "data/Output/"
 PFAD_DB_OUT <- "data/Database/"
 #Liste der Dateien in denen die Distincts gesucht werden
 FIND_DISTINCTS_FILES = c("GC3D_Limits_ref.xlsx", "GC3D_Units_ref.xlsx")
 #Datei in der ersetzt werden soll
 FILENAME_EXCEL <-  "project vocabulary_BB.xlsx"
 #Name des Excelsheets in der ersetzt werden soll
 SHEETNAME <- "Projektvokabular"
 #Spaltennamen (Muster), in denen die Distincts gesucht werden
 COLUMN_NAME <- "bibliographicCitation"
 #Spaltennummer in denen ersetzt werden soll im Excel
 SPALTEN <- c(10, 11, 12, 13)
--- a/find_and_replace_uri.R
+++ b/find_and_replace_uri.R
@ -15,10 +15,9 @@ source("config/config.R")
 ### Einlesen ###
 inDB <-  get_current_geoera_lit_db()
-to_replace_original <- read_excel(paste(PFAD_EXCEL, FILENAME_EXCEL, sep=""), sheet = 1)
+to_replace_original <- read_excel(paste(PFAD_EXCEL, FILENAME_EXCEL, sep=""), sheet = SHEETNAME)
 to_replace_done <- to_replace_original
 #Erste Schleife über den Spaltenvektor aus der Excel
 for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) { 
--- a/find_new_entries.R
+++ b/find_new_entries.R
@ -5,66 +5,82 @@ library("xlsx")
 source("config/config.R")
 #Übernimmt die Liste der zu durchsuchenden Excelfiles aus der Konfiguration
 file_list <- FIND_DISTINCTS_FILES
-#select all *.xlsx files inside the Input-Folder and put them into a list
+#Legt eine leere Liste zum Befüllen an
 file_list <- list.files(PFAD_EXCEL, "*.xlsx")
 my_table <- c()
 #Liest die Excelfiles ein
 for (file in 1:length(file_list)) {
  temp_pfad <- paste(PFAD_EXCEL, file_list[file], sep="")
  my_table[[file]] <- read_excel(temp_pfad)
 }
-
+#Wählt die Spalten aus in denen die Distincts gesucht werden
 #select the bibliographicCitation columns in all tables
 for(spalte in 1:length(my_table))  {
  #Aktuelle Spalte in die Liste
  df_all <- data.frame(my_table[[spalte]])
  #Liefert die Spaltennamen, die der Vorgabe (COLUMN_NAME) entsprechen
  Names <- colnames(my_table[[spalte]])
-  Names <- Names[grepl("^bibliographicCitation", Names)]
+  Names <- Names[grepl(paste("^", COLUMN_NAME, sep=""), Names)]
  #Legt einen leeren DataFrame an
  temp <- data.frame()
  for (j in 1:length(Names)) {
-    #omit empty rows
+    #Verwirft leere Elemente
    df_temp <- na.omit(df_all[Names[j]])
-    #rename columns
+    #benennt die Spalten
-    colnames(df_temp) <- (c('bibliographicCitation'))
+    colnames(df_temp) <- (c(COLUMN_NAME))
-    #create a big dataframe
+    #Fügt die aktuelle Spalte zur Gesamtliste hinzu
    temp <- rbind(temp, df_temp)
  }
  my_table[[spalte]] <- temp
 }
-#combine all the columns of all tables into one
+#Fügt die Spalten aller Exceldateien zu einer Liste zusammen
-for (i in 2:length(my_table)) {
+if (length(my_table) > 1) {
  for (i in 2:length(my_table)) {
    my_table[[i]] <- rbind(my_table[[i-1]], my_table[[i]])
  }
  all <-  my_table[[length(my_table)]]
 }
-all <-  my_table[[length(my_table)]]
+if (length(my_table) == 1) {
  all <-  my_table
 }
-
+#Sucht alle Uniques
 #remove multiple spaces
 distincts <- lapply(all, FUN=str_squish)
 #get all unique values and sort
 distincts <-  unique(all)
 distincts <- data.frame(distincts[order(distincts$bibliographicCitation), ])
 colnames(distincts) <- (c('bibliographicCitation'))
 #Sortiert alphabetisch
 distincts <- data.frame(distincts[order(distincts$bibliographicCitation), ])
 #Benennt die Spalte 
 colnames(distincts) <- (c(COLUMN_NAME))
 #Entfernt Steuerungszeichen
 for (i in i:length(distincts$bibliographicCitation)) {
  distincts[i, 1] <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", distincts[i, 1], perl=TRUE)
 }
 #Sucht nochmals nach den Uniques
 distincts <- unique(distincts)
-
+#Entfernt alle Einträge die zu kurz (<THRESHOLD) sind
 distincts <- data.frame(distincts[nchar(distincts$bibliographicCitation) >= THRESHOLD, ])
 colnames(distincts) <- (c('bibliographicCitation'))
 #Schreibt die Distincts in eine Exceldatei
 write_xlsx(distincts, "data/Output/distincts_automated_gc3d.xlsx")