# find_new_entries.R
#
# Collects the values of every "bibliographicCitation*" column from all Excel
# workbooks in the input folder, normalises whitespace, removes duplicates and
# short entries, and writes the sorted result to a single Excel file with one
# column named "bibliographicCitation".

library("readxl")
library("writexl")
library("tidyverse")
library("xlsx")

# NOTE(review): a hard-coded, machine-specific working directory makes the
# script non-portable. It is kept because the relative paths below are
# resolved against it; consider running the script from the project root
# instead.
setwd("C:/Users/linchr/ownCloud/GIT/URI_Replacement")

pfad <- "data/Input/"
output_file <- "data/Output/distincts_automated_gc3d.xlsx"

# Citations with fewer characters than this are dropped from the output.
min_chars <- 21

# Select all .xlsx files inside the input folder. `pattern` is a regular
# expression (not a shell glob), so match the literal extension at the end
# of the file name.
file_list <- list.files(pfad, pattern = "\\.xlsx$")

# Skip "~$..." lock files, which Excel leaves next to an open workbook and
# which cannot be read as workbooks.
file_list <- file_list[!startsWith(file_list, "~$")]

if (length(file_list) == 0) {
  stop("No .xlsx files found in '", pfad, "'", call. = FALSE)
}

# One table per input file.
my_table <- lapply(paste0(pfad, file_list), read_excel)

# Return the non-missing values of all bibliographicCitation* columns of one
# table as a single character vector (character(0) if there is no such
# column). Columns are looked up on the table itself, so the names found by
# grep() are exactly the names used for extraction.
collect_citations <- function(tbl) {
  cols <- grep("^bibliographicCitation", colnames(tbl), value = TRUE)
  values <- unlist(
    lapply(cols, function(col) as.character(tbl[[col]])),
    use.names = FALSE
  )
  # unlist() of an empty list is NULL; normalise to character(0).
  values <- as.character(values)
  values[!is.na(values)]
}

# Combine the citation columns of all tables into one vector.
all_citations <- as.character(
  unlist(lapply(my_table, collect_citations), use.names = FALSE)
)

# Remove repeated whitespace: every whitespace character that directly
# follows another one is deleted, and leading/trailing whitespace is trimmed.
# gsub() is vectorised, so every value is cleaned in one call.
cleaned <- gsub(
  "(?<=[\\s])\\s*|^\\s+|\\s+$", "", all_citations,
  perl = TRUE
)

# De-duplicate and sort *after* cleaning so that values differing only in
# whitespace collapse into one entry and the final order reflects the
# cleaned text.
cleaned <- sort(unique(cleaned))

# Keep only entries that are long enough.
distincts <- data.frame(
  bibliographicCitation = cleaned[nchar(cleaned) >= min_chars],
  stringsAsFactors = FALSE
)

write_xlsx(distincts, output_file)