From 667dc7d68d8142342d9018b3991cd5ffecfaaef0 Mon Sep 17 00:00:00 2001
From: Linsberger Christian <linchr@gba.geolba.ac.at>
Date: Thu, 2 Sep 2021 14:00:44 +0200
Subject: [PATCH] =?UTF-8?q?Generelles=20Zahlenmatching=20auf=20Jahrmatchin?=
 =?UTF-8?q?g=20ge=C3=A4ndert.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 find_and_replace_uri.R | 27 ++++++++++++++++-----------
 find_new_entries.R     | 11 +++++++----
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/find_and_replace_uri.R b/find_and_replace_uri.R
index 8239d16..b927747 100644
--- a/find_and_replace_uri.R
+++ b/find_and_replace_uri.R
@@ -6,6 +6,7 @@ library("stringi")
 library("SPARQL")
 library("stringr")
 library("tictoc")
+library("rlang")
 
 source("../sparql.R")
 
@@ -63,17 +64,19 @@ for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) {
     }
     
     excel_search_all <- unlist(strsplit(gsub("[^[:alnum:] ]", "", current_excel), " +"))
-    excel_search_numbers <- str_extract_all(current_excel, "\\(?[0-9]+\\)?")
+    excel_search_numbers <- unlist(str_extract_all(current_excel, "(?:19|20)\\d{2}"))
+  
     
     for (zeile_db in 1:nrow(inDB[, INHALTE_DB])) {
       
       current_db <- tolower(inDB[zeile_db,INHALTE_DB])
       
       db_search_all <- unlist(strsplit(gsub("[^[:alnum:] ]", "", current_db), " +"))
-      db_search_numbers <- str_extract_all(current_db, "\\(?[0-9]+\\)?")
+      db_search_numbers <- unlist(str_extract_all(current_db, "(?:19|20)\\d{2}"))
       
       count_all <- 0
       count_numbers <- 0
+      percent_match_numbers <- 0.1
       
       for (k in 1:length(excel_search_all)) {
         if (excel_search_all[k] %in% db_search_all) {
@@ -81,22 +84,25 @@ for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) {
           count_all <- count_all + 1
         }
       }
-      
-      for (l in 1:length(excel_search_numbers)) {
-        if (excel_search_numbers[l] %in% db_search_numbers) {
-          
-          count_numbers <- count_numbers + 1
+       
+      if ((length(db_search_numbers) > 0) && (length(excel_search_numbers) > 0)) {
+        for (l in 1:length(excel_search_numbers)) {
+          if (excel_search_numbers[l] %in% db_search_numbers) {
+            
+            count_numbers <- count_numbers + 1
+          }
         }
       }
       
       percent_match_all <- count_all / length(excel_search_all)
-      percent_match_numbers <- count_numbers / length(excel_search_numbers)
+      
+      if (length(excel_search_numbers) > 0) {
+        percent_match_numbers <- count_numbers / length(excel_search_numbers)
+      }
       
       if ((percent_match_all > 0.80) && (percent_match_numbers > 0.99)) {
         
-        
         to_replace_done[zeile_excel, spalte_excel] <- str_remove_all(inDB[zeile_db,ID], "[<>]")
-        
       }
       
     }
@@ -107,6 +113,5 @@ for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) {
 toc()
 
 pfad_output <- paste(PFAD_EXCEL, "replaced_" , FILENAME_EXCEL, sep="")
-
 write_xlsx(to_replace_done, pfad_output)
 
diff --git a/find_new_entries.R b/find_new_entries.R
index 935f483..0550a68 100644
--- a/find_new_entries.R
+++ b/find_new_entries.R
@@ -20,24 +20,27 @@ for (file in 1:length(file_list)) {
 
 
 #select the bibliographicCitation columns in all tables
-for(i in 1:length(my_table))  {
-  df_all <- data.frame(my_table[[i]])
+for (spalte in seq_along(my_table)) {
+  df_all <- data.frame(my_table[[spalte]])
   
-  Names <- colnames(my_table[[i]])
+  Names <- colnames(my_table[[spalte]])
   Names <- Names[grepl("^bibliographicCitation", Names)]
   
   temp <- data.frame()
   
-  for (j in 1:length(Names)) {
+  for (j in seq_along(Names)) {
+    # seq_along() instead of 1:length(): zero iterations on empty input
     #omit empty rows
     df_temp <- na.omit(df_all[Names[j]])
+    
     #rename columns
     colnames(df_temp) <- (c('bibliographicCitation'))
+    
     #create a big dataframe
     temp <- rbind(temp, df_temp)
   }
   
-  my_table[[i]] <- temp
+  my_table[[spalte]] <- temp
 }
 
 #combine all the columns of all tables into one