# Collects every non-empty value from all "bibliographicCitation*" columns of
# the Excel workbooks in the input folder, normalises whitespace, removes
# duplicates, sorts the values, drops short entries and writes the result to a
# single-column Excel file.

library("readxl")
library("writexl")
library("tidyverse")
library("xlsx")

# NOTE(review): hard-coded, machine-specific working directory; every relative
# path below is resolved against it. Consider running the script from the
# project root instead of calling setwd().
setwd("C:/Users/linchr/ownCloud/GIT/URI_Replacement")

pfad <- "data/Input/"

# Entries with fewer characters than this are dropped from the output.
# NOTE(review): presumably such entries are too short to be real citations --
# confirm the threshold with the data owner.
min_citation_chars <- 21

# Select all *.xlsx files inside the input folder. `pattern` is a regular
# expression (not a glob), so it is anchored to the file extension.
file_list <- list.files(pfad, pattern = "\\.xlsx$")
# Skip the "~$..." lock files Excel creates next to an open workbook; they are
# not readable workbooks.
file_list <- file_list[!startsWith(file_list, "~$")]
if (length(file_list) == 0) {
  stop("No .xlsx files found in '", pfad, "'.", call. = FALSE)
}

# Read the first sheet of every workbook into a list of tibbles.
my_table <- lapply(paste0(pfad, file_list), read_excel)

# Return all non-missing values of the columns whose name starts with
# "bibliographicCitation" as one character vector (column by column).
# Returns character(0) when the table has no such column.
collect_citations <- function(sheet) {
  citation_cols <- grep(
    "^bibliographicCitation", colnames(sheet),
    value = TRUE
  )
  values <- unlist(
    lapply(citation_cols, function(nm) as.character(sheet[[nm]])),
    use.names = FALSE
  )
  if (is.null(values)) {
    return(character(0))
  }
  values[!is.na(values)]
}

# Combine the citation values of all tables into one vector.
citations <- unlist(lapply(my_table, collect_citations), use.names = FALSE)
if (is.null(citations)) {
  citations <- character(0)
}

# Collapse runs of whitespace to their first character and trim both ends.
# This is applied to EVERY value before de-duplicating and sorting, so the
# output is unique and sorted with respect to the cleaned text.
citations <- gsub(
  "(?<=[\\s])\\s*|^\\s+|\\s+$", "", citations,
  perl = TRUE
)

# Get all unique values, sort them and drop the short entries.
citations <- unique(citations)
citations <- citations[order(citations)]
citations <- citations[nchar(citations) >= min_citation_chars]

distincts <- data.frame(
  bibliographicCitation = citations,
  stringsAsFactors = FALSE
)

write_xlsx(distincts, "data/Output/distincts_automated_gc3d.xlsx")