From 667dc7d68d8142342d9018b3991cd5ffecfaaef0 Mon Sep 17 00:00:00 2001 From: Linsberger Christian Date: Thu, 2 Sep 2021 14:00:44 +0200 Subject: [PATCH] =?UTF-8?q?Generelles=20Zahlenmatching=20auf=20Jahrmatchin?= =?UTF-8?q?g=20ge=C3=A4ndert.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- find_and_replace_uri.R | 27 ++++++++++++++++----------- find_new_entries.R | 11 +++++++---- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/find_and_replace_uri.R b/find_and_replace_uri.R index 8239d16..b927747 100644 --- a/find_and_replace_uri.R +++ b/find_and_replace_uri.R @@ -6,6 +6,7 @@ library("stringi") library("SPARQL") library("stringr") library("tictoc") +library("rlang") source("../sparql.R") @@ -63,17 +64,19 @@ for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) { } excel_search_all <- unlist(strsplit(gsub("[^[:alnum:] ]", "", current_excel), " +")) - excel_search_numbers <- str_extract_all(current_excel, "\\(?[0-9]+\\)?") + excel_search_numbers <- unlist(str_extract_all(current_excel, "(?:19|20)\\d{2}")) + for (zeile_db in 1:nrow(inDB[, INHALTE_DB])) { current_db <- tolower(inDB[zeile_db,INHALTE_DB]) db_search_all <- unlist(strsplit(gsub("[^[:alnum:] ]", "", current_db), " +")) - db_search_numbers <- str_extract_all(current_db, "\\(?[0-9]+\\)?") + db_search_numbers <- unlist(str_extract_all(current_db, "(?:19|20)\\d{2}")) count_all <- 0 count_numbers <- 0 + percent_match_numbers <- 0.1 for (k in 1:length(excel_search_all)) { if (excel_search_all[k] %in% db_search_all) { @@ -81,22 +84,25 @@ for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) { count_all <- count_all + 1 } } - - for (l in 1:length(excel_search_numbers)) { - if (excel_search_numbers[l] %in% db_search_numbers) { - - count_numbers <- count_numbers + 1 + + if ((length(db_search_numbers) > 0) && (length(excel_search_numbers) > 0)) { + for (l in 1:length(excel_search_numbers)) { + if (excel_search_numbers[l] %in% db_search_numbers) { + + count_numbers <- count_numbers + 1 + } } } percent_match_all <- count_all / length(excel_search_all) - percent_match_numbers <- count_numbers / length(excel_search_numbers) + + if (length(excel_search_numbers) > 0) { + percent_match_numbers <- count_numbers / length(excel_search_numbers) + } if ((percent_match_all > 0.80) && (percent_match_numbers > 0.99)) { - to_replace_done[zeile_excel, spalte_excel] <- str_remove_all(inDB[zeile_db,ID], "[<>]") - } } @@ -107,6 +113,5 @@ for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) { toc() pfad_output <- paste(PFAD_EXCEL, "replaced_" , FILENAME_EXCEL, sep="") - write_xlsx(to_replace_done, pfad_output) diff --git a/find_new_entries.R b/find_new_entries.R index 935f483..0550a68 100644 --- a/find_new_entries.R +++ b/find_new_entries.R @@ -20,24 +20,27 @@ for (file in 1:length(file_list)) { #select the bibliographicCitation columns in all tables -for(i in 1:length(my_table)) { - df_all <- data.frame(my_table[[i]]) +for(spalte in 1:length(my_table)) { + df_all <- data.frame(my_table[[spalte]]) - Names <- colnames(my_table[[i]]) + Names <- colnames(my_table[[spalte]]) Names <- Names[grepl("^bibliographicCitation", Names)] temp <- data.frame() for (j in 1:length(Names)) { + #omit empty rows df_temp <- na.omit(df_all[Names[j]]) + #rename columns colnames(df_temp) <- (c('bibliographicCitation')) + #create a big dataframe temp <- rbind(temp, df_temp) } - my_table[[i]] <- temp + my_table[[spalte]] <- temp } #combine all the columns of all tables into one