71 lines
1.7 KiB
R
71 lines
1.7 KiB
R
library("readxl")
|
|
library("writexl")
|
|
library("tidyverse")
|
|
library("xlsx")
|
|
|
|
|
|
|
|
# Folder holding the Excel workbooks to merge. The trailing slash is required
# because file names are appended to this string directly.
pfad <- "data/Input/"

# Select all .xlsx files inside the input folder.
# `pattern` is a regular expression, not a shell glob: "*.xlsx" is not anchored
# to the end of the name, so the extension is matched explicitly instead.
file_list <- list.files(pfad, pattern = "\\.xlsx$")

# Excel creates "~$<name>.xlsx" lock files while a workbook is open; they are
# not readable workbooks, so leave them out.
file_list <- file_list[!startsWith(file_list, "~$")]

# Fail early with a clear message instead of trying to read "data/Input/NA",
# which is what `1:length(file_list)` produced for an empty folder.
if (length(file_list) == 0) {
  stop("No .xlsx files found in '", pfad, "'", call. = FALSE)
}

# Read every workbook (first sheet, read_excel defaults) into an unnamed list
# with one element per file, in the order of `file_list`.
my_table <- lapply(paste0(pfad, file_list), read_excel)
|
|
|
|
|
|
|
|
# Select the bibliographicCitation columns in all tables.
# Each element of `my_table` is replaced by a one-column data frame named
# "bibliographicCitation" that stacks every column whose name starts with
# "bibliographicCitation", with NA rows removed per column.
for (spalte in seq_along(my_table)) {
  # check.names = FALSE keeps the headers exactly as they were read, so the
  # names matched below are guaranteed to exist in `df_all`.
  df_all <- data.frame(my_table[[spalte]], check.names = FALSE)

  citation_cols <- grep(
    "^bibliographicCitation",
    colnames(df_all),
    value = TRUE
  )

  # One single-column data frame per matching column; empty cells dropped.
  stacked <- lapply(citation_cols, function(col) {
    df_temp <- na.omit(df_all[col])
    colnames(df_temp) <- "bibliographicCitation"
    df_temp
  })

  # A table without any matching column contributes zero rows instead of
  # aborting the whole run.
  my_table[[spalte]] <- if (length(stacked) > 0) {
    do.call(rbind, stacked)
  } else {
    data.frame(bibliographicCitation = character(0))
  }
}
|
|
|
|
# Combine the citation columns of all tables into one data frame.
# do.call(rbind, ...) also works when there is only a single input table,
# where the previous `2:length(my_table)` loop indexed out of bounds.
# NOTE: `all` shadows base::all as a variable name; kept for compatibility.
all <- do.call(rbind, my_table)

# Normalise whitespace on every entry: trim both ends and collapse internal
# runs of whitespace to a single space. This applies the str_squish step that
# was previously computed and then overwritten, and replaces the regex loop
# that started at a stale index `i` and therefore skipped the first rows.
cleaned <- str_squish(as.character(all$bibliographicCitation))

# Get all unique values and sort. Doing this after the clean-up means
# entries that differ only in whitespace collapse into one, and the order
# reflects the final text.
cleaned <- sort(unique(cleaned))

# Keep only entries with at least this many characters.
# NOTE(review): threshold taken unchanged from the original script; presumably
# it drops fragments that are too short to be a citation -- confirm.
min_chars <- 21
cleaned <- cleaned[nchar(cleaned) >= min_chars]

distincts <- data.frame(bibliographicCitation = cleaned)

# The output folder must already exist; write_xlsx does not create it.
write_xlsx(distincts, "data/Output/distincts_automated_gc3d.xlsx")
|