2 New Files, Renamed.
This commit is contained in:
parent
0ba9850f10
commit
baef13edde
4 changed files with 38 additions and 1 deletions
86
01_find_new_entries.R
Normal file
86
01_find_new_entries.R
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
library("readxl")
|
||||
library("writexl")
|
||||
library("tidyverse")
|
||||
library("xlsx")
|
||||
|
||||
source("config/config.R")
|
||||
|
||||
# Take the list of Excel files to scan from the configuration.
file_list <- FIND_DISTINCTS_FILES

# Fail early with a clear message instead of trying to read "<path>NA",
# which is what 1:length() on an empty list would lead to.
if (length(file_list) == 0) {
  stop("FIND_DISTINCTS_FILES is empty: no Excel files to read.", call. = FALSE)
}

# Read every workbook. my_table is a list with one table per configured file,
# in the same order as file_list; each file name is appended to PFAD_EXCEL.
my_table <- lapply(seq_along(file_list), function(idx) {
  read_excel(paste0(PFAD_EXCEL, file_list[idx]))
})
|
||||
|
||||
|
||||
# Reduce every table in my_table to a single column named COLUMN_NAME that
# stacks the non-NA values of all columns whose name starts with COLUMN_NAME.
for (table_idx in seq_along(my_table)) {

  # as.data.frame() keeps the column names exactly as they were read, so the
  # names matched below are guaranteed to exist in `sheet` (data.frame() would
  # run make.names() on them and could rename non-syntactic headers).
  sheet <- as.data.frame(my_table[[table_idx]])

  # Names of the columns that match the configured prefix (COLUMN_NAME is
  # used as a regular expression anchored at the start of the name).
  matching <- grep(paste0("^", COLUMN_NAME), colnames(sheet), value = TRUE)

  # One single-column data frame per matching column, empty cells dropped.
  pieces <- lapply(matching, function(col) {
    piece <- na.omit(sheet[col])
    colnames(piece) <- COLUMN_NAME
    piece
  })

  if (length(pieces) > 0) {
    # Stack all matching columns in one call instead of growing in a loop.
    my_table[[table_idx]] <- do.call(rbind, pieces)
  } else {
    # No matching column in this file: keep a zero-row placeholder with the
    # expected column so later steps can still combine the tables.
    placeholder <- data.frame(character(0), stringsAsFactors = FALSE)
    colnames(placeholder) <- COLUMN_NAME
    my_table[[table_idx]] <- placeholder
  }
}
|
||||
|
||||
# Combine the collected columns of all Excel files into one data frame.
# do.call(rbind, ...) handles one table and many tables alike, so the
# single-file case yields a data frame as well (not the enclosing list).
all <- do.call(rbind, my_table)

# Pull the values out as a plain character vector; the collected tables carry
# exactly one column, so the first column is the one of interest.
if (is.null(all) || ncol(all) == 0) {
  values <- character(0)
} else {
  values <- as.character(all[[1]])
}

# Normalise whitespace on every entry: strip leading and trailing whitespace
# and drop any whitespace that directly follows another whitespace character.
# Vectorised over the whole column, so no entry is skipped.
values <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", values, perl = TRUE)

# Keep each entry once. This runs after the normalisation so that entries
# differing only in whitespace collapse into one.
values <- unique(values)

# Sort alphabetically (after cleaning, so the written list is really sorted).
values <- values[order(values)]

# Drop all entries that are too short (fewer than THRESHOLD characters).
values <- values[nchar(values) >= THRESHOLD]

# Result table; the output column header is fixed to 'bibliographicCitation'.
distincts <- data.frame(values, stringsAsFactors = FALSE)
colnames(distincts) <- (c('bibliographicCitation'))

# Write the distinct entries to an Excel file.
write_xlsx(distincts, paste0(PFAD_OUT, "distincts_automated_gc3d.xlsx"))
|
||||
Loading…
Add table
Add a link
Reference in a new issue