Comments & more Config
This commit is contained in:
parent
6a8966a263
commit
8a1fa79065
3 changed files with 46 additions and 22 deletions
|
|
@ -6,9 +6,18 @@ PFAD_EXCEL <- "data/Input/"
|
||||||
PFAD_OUT <- "data/Output/"
|
PFAD_OUT <- "data/Output/"
|
||||||
PFAD_DB_OUT <- "data/Database/"
|
PFAD_DB_OUT <- "data/Database/"
|
||||||
|
|
||||||
|
#Liste der Dateien in denen die Distincts gesucht werden
|
||||||
|
FIND_DISTINCTS_FILES = c("GC3D_Limits_ref.xlsx", "GC3D_Units_ref.xlsx")
|
||||||
|
|
||||||
#Datei in der ersetzt werden soll
|
#Datei in der ersetzt werden soll
|
||||||
FILENAME_EXCEL <- "project vocabulary_BB.xlsx"
|
FILENAME_EXCEL <- "project vocabulary_BB.xlsx"
|
||||||
|
|
||||||
|
#Name des Excelsheets in der ersetzt werden soll
|
||||||
|
SHEETNAME <- "Projektvokabular"
|
||||||
|
|
||||||
|
#Spaltennamen(Muster) in denen die Distincts gesucht werden
|
||||||
|
COLUMN_NAME <- "bibliographicCitation"
|
||||||
|
|
||||||
#Spaltennummer in denen ersetzt werden soll im Excel
|
#Spaltennummer in denen ersetzt werden soll im Excel
|
||||||
SPALTEN <- c(10, 11, 12, 13)
|
SPALTEN <- c(10, 11, 12, 13)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,10 +15,9 @@ source("config/config.R")
|
||||||
### Einlesen ###
|
### Einlesen ###
|
||||||
inDB <- get_current_geoera_lit_db()
|
inDB <- get_current_geoera_lit_db()
|
||||||
|
|
||||||
to_replace_original <- read_excel(paste(PFAD_EXCEL, FILENAME_EXCEL, sep=""), sheet = 1)
|
to_replace_original <- read_excel(paste(PFAD_EXCEL, FILENAME_EXCEL, sep=""), sheet = SHEETNAME)
|
||||||
to_replace_done <- to_replace_original
|
to_replace_done <- to_replace_original
|
||||||
|
|
||||||
|
|
||||||
#Erste Schleife über den Spaltenvektor aus der Excel
|
#Erste Schleife über den Spaltenvektor aus der Excel
|
||||||
for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) {
|
for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) {
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,66 +5,82 @@ library("xlsx")
|
||||||
|
|
||||||
source("config/config.R")
|
source("config/config.R")
|
||||||
|
|
||||||
|
#Übernimmt die Liste der zu durchsuchenden Excelfiles aus der Konfiguration
|
||||||
|
file_list <- FIND_DISTINCTS_FILES
|
||||||
|
|
||||||
#select all *.xlsx files inside the Input-Folder and put them into a list
|
#Legt eine leere Liste zum Befüllen an
|
||||||
file_list <- list.files(PFAD_EXCEL, "*.xlsx")
|
|
||||||
|
|
||||||
my_table <- c()
|
my_table <- c()
|
||||||
|
|
||||||
|
|
||||||
|
#Liest die Excelfiles ein
|
||||||
for (file in 1:length(file_list)) {
|
for (file in 1:length(file_list)) {
|
||||||
temp_pfad <- paste(PFAD_EXCEL, file_list[file], sep="")
|
temp_pfad <- paste(PFAD_EXCEL, file_list[file], sep="")
|
||||||
my_table[[file]] <- read_excel(temp_pfad)
|
my_table[[file]] <- read_excel(temp_pfad)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#Wählt die Spalten aus in denen die Distincts gesucht werden
|
||||||
#select the bibliographicCitation columns in all tables
|
|
||||||
for(spalte in 1:length(my_table)) {
|
for(spalte in 1:length(my_table)) {
|
||||||
|
|
||||||
|
#Aktuelle Spalte in die Liste
|
||||||
df_all <- data.frame(my_table[[spalte]])
|
df_all <- data.frame(my_table[[spalte]])
|
||||||
|
|
||||||
|
#Liefert die Spaltennamen, die der Vorgabe (COLUMN_NAME) entsprechen
|
||||||
Names <- colnames(my_table[[spalte]])
|
Names <- colnames(my_table[[spalte]])
|
||||||
Names <- Names[grepl("^bibliographicCitation", Names)]
|
Names <- Names[grepl(paste("^", COLUMN_NAME, sep=""), Names)]
|
||||||
|
|
||||||
|
|
||||||
|
#Legt einen leeren DataFrame an
|
||||||
temp <- data.frame()
|
temp <- data.frame()
|
||||||
|
|
||||||
for (j in 1:length(Names)) {
|
for (j in 1:length(Names)) {
|
||||||
|
|
||||||
#omit empty rows
|
#Verwirft leere Elemente
|
||||||
df_temp <- na.omit(df_all[Names[j]])
|
df_temp <- na.omit(df_all[Names[j]])
|
||||||
|
|
||||||
#rename columns
|
#benennt die Spalten
|
||||||
colnames(df_temp) <- (c('bibliographicCitation'))
|
colnames(df_temp) <- (c(COLUMN_NAME))
|
||||||
|
|
||||||
#create a big dataframe
|
#Fügt die aktuelle Spalte zur Gesamtliste hinzu
|
||||||
temp <- rbind(temp, df_temp)
|
temp <- rbind(temp, df_temp)
|
||||||
}
|
}
|
||||||
|
|
||||||
my_table[[spalte]] <- temp
|
my_table[[spalte]] <- temp
|
||||||
}
|
}
|
||||||
|
|
||||||
#combine all the columns of all tables into one
|
#Fügt die Spalten aller Exceldateien zu einer Liste zusammen
|
||||||
|
if (length(my_table) > 1) {
|
||||||
for (i in 2:length(my_table)) {
|
for (i in 2:length(my_table)) {
|
||||||
my_table[[i]] <- rbind(my_table[[i-1]], my_table[[i]])
|
my_table[[i]] <- rbind(my_table[[i-1]], my_table[[i]])
|
||||||
}
|
}
|
||||||
|
|
||||||
all <- my_table[[length(my_table)]]
|
all <- my_table[[length(my_table)]]
|
||||||
|
}
|
||||||
|
|
||||||
|
if (length(my_table) == 1) {
|
||||||
|
all <- my_table
|
||||||
|
}
|
||||||
|
|
||||||
#remove multiple spaces
|
#Sucht alle Uniques
|
||||||
distincts <- lapply(all, FUN=str_squish)
|
|
||||||
#get all unique values and sort
|
|
||||||
distincts <- unique(all)
|
distincts <- unique(all)
|
||||||
distincts <- data.frame(distincts[order(distincts$bibliographicCitation), ])
|
|
||||||
colnames(distincts) <- (c('bibliographicCitation'))
|
|
||||||
|
|
||||||
|
#Sortiert alphabetisch
|
||||||
|
distincts <- data.frame(distincts[order(distincts$bibliographicCitation), ])
|
||||||
|
|
||||||
|
#Benennt die Spalte
|
||||||
|
colnames(distincts) <- (c(COLUMN_NAME))
|
||||||
|
|
||||||
|
#Entfernt Steuerungszeichen
|
||||||
for (i in i:length(distincts$bibliographicCitation)) {
|
for (i in i:length(distincts$bibliographicCitation)) {
|
||||||
distincts[i, 1] <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", distincts[i, 1], perl=TRUE)
|
distincts[i, 1] <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", distincts[i, 1], perl=TRUE)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#Sucht nochmals nach den Uniques
|
||||||
distincts <- unique(distincts)
|
distincts <- unique(distincts)
|
||||||
|
|
||||||
|
#Entfernt alle Einträge die zu kurz (<THRESHOLD) sind
|
||||||
distincts <- data.frame(distincts[nchar(distincts$bibliographicCitation) >= THRESHOLD, ])
|
distincts <- data.frame(distincts[nchar(distincts$bibliographicCitation) >= THRESHOLD, ])
|
||||||
colnames(distincts) <- (c('bibliographicCitation'))
|
colnames(distincts) <- (c('bibliographicCitation'))
|
||||||
|
|
||||||
|
#Schreibt die Distincts in eine Exceldatei
|
||||||
write_xlsx(distincts, "data/Output/distincts_automated_gc3d.xlsx")
|
write_xlsx(distincts, "data/Output/distincts_automated_gc3d.xlsx")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue