Generelles Zahlenmatching auf Jahrmatching geändert.
This commit is contained in:
parent
52ed5e8984
commit
667dc7d68d
2 changed files with 23 additions and 15 deletions
|
|
@ -6,6 +6,7 @@ library("stringi")
|
||||||
library("SPARQL")
|
library("SPARQL")
|
||||||
library("stringr")
|
library("stringr")
|
||||||
library("tictoc")
|
library("tictoc")
|
||||||
|
library("rlang")
|
||||||
|
|
||||||
source("../sparql.R")
|
source("../sparql.R")
|
||||||
|
|
||||||
|
|
@ -63,17 +64,19 @@ for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) {
|
||||||
}
|
}
|
||||||
|
|
||||||
excel_search_all <- unlist(strsplit(gsub("[^[:alnum:] ]", "", current_excel), " +"))
|
excel_search_all <- unlist(strsplit(gsub("[^[:alnum:] ]", "", current_excel), " +"))
|
||||||
excel_search_numbers <- str_extract_all(current_excel, "\\(?[0-9]+\\)?")
|
excel_search_numbers <- unlist(str_extract_all(current_excel, "(?:19|20)\\d{2}"))
|
||||||
|
|
||||||
|
|
||||||
for (zeile_db in 1:nrow(inDB[, INHALTE_DB])) {
|
for (zeile_db in 1:nrow(inDB[, INHALTE_DB])) {
|
||||||
|
|
||||||
current_db <- tolower(inDB[zeile_db,INHALTE_DB])
|
current_db <- tolower(inDB[zeile_db,INHALTE_DB])
|
||||||
|
|
||||||
db_search_all <- unlist(strsplit(gsub("[^[:alnum:] ]", "", current_db), " +"))
|
db_search_all <- unlist(strsplit(gsub("[^[:alnum:] ]", "", current_db), " +"))
|
||||||
db_search_numbers <- str_extract_all(current_db, "\\(?[0-9]+\\)?")
|
db_search_numbers <- unlist(str_extract_all(current_db, "(?:19|20)\\d{2}"))
|
||||||
|
|
||||||
count_all <- 0
|
count_all <- 0
|
||||||
count_numbers <- 0
|
count_numbers <- 0
|
||||||
|
percent_match_numbers <- 0.1
|
||||||
|
|
||||||
for (k in 1:length(excel_search_all)) {
|
for (k in 1:length(excel_search_all)) {
|
||||||
if (excel_search_all[k] %in% db_search_all) {
|
if (excel_search_all[k] %in% db_search_all) {
|
||||||
|
|
@ -81,22 +84,25 @@ for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) {
|
||||||
count_all <- count_all + 1
|
count_all <- count_all + 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (l in 1:length(excel_search_numbers)) {
|
if ((length(db_search_numbers) > 0) && (length(excel_search_numbers) > 0)) {
|
||||||
if (excel_search_numbers[l] %in% db_search_numbers) {
|
for (l in 1:length(excel_search_numbers)) {
|
||||||
|
if (excel_search_numbers[l] %in% db_search_numbers) {
|
||||||
count_numbers <- count_numbers + 1
|
|
||||||
|
count_numbers <- count_numbers + 1
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
percent_match_all <- count_all / length(excel_search_all)
|
percent_match_all <- count_all / length(excel_search_all)
|
||||||
percent_match_numbers <- count_numbers / length(excel_search_numbers)
|
|
||||||
|
if (length(excel_search_numbers) > 0) {
|
||||||
|
percent_match_numbers <- count_numbers / length(excel_search_numbers)
|
||||||
|
}
|
||||||
|
|
||||||
if ((percent_match_all > 0.80) && (percent_match_numbers > 0.99)) {
|
if ((percent_match_all > 0.80) && (percent_match_numbers > 0.99)) {
|
||||||
|
|
||||||
|
|
||||||
to_replace_done[zeile_excel, spalte_excel] <- str_remove_all(inDB[zeile_db,ID], "[<>]")
|
to_replace_done[zeile_excel, spalte_excel] <- str_remove_all(inDB[zeile_db,ID], "[<>]")
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -107,6 +113,5 @@ for (spalte_excel in SPALTEN[1]:SPALTEN[length(SPALTEN)]) {
|
||||||
toc()
|
toc()
|
||||||
|
|
||||||
pfad_output <- paste(PFAD_EXCEL, "replaced_" , FILENAME_EXCEL, sep="")
|
pfad_output <- paste(PFAD_EXCEL, "replaced_" , FILENAME_EXCEL, sep="")
|
||||||
|
|
||||||
write_xlsx(to_replace_done, pfad_output)
|
write_xlsx(to_replace_done, pfad_output)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -20,24 +20,27 @@ for (file in 1:length(file_list)) {
|
||||||
|
|
||||||
|
|
||||||
#select the bibliographicCitation columns in all tables
|
#select the bibliographicCitation columns in all tables
|
||||||
for(i in 1:length(my_table)) {
|
for(spalte in 1:length(my_table)) {
|
||||||
df_all <- data.frame(my_table[[i]])
|
df_all <- data.frame(my_table[[spalte]])
|
||||||
|
|
||||||
Names <- colnames(my_table[[i]])
|
Names <- colnames(my_table[[spalte]])
|
||||||
Names <- Names[grepl("^bibliographicCitation", Names)]
|
Names <- Names[grepl("^bibliographicCitation", Names)]
|
||||||
|
|
||||||
temp <- data.frame()
|
temp <- data.frame()
|
||||||
|
|
||||||
for (j in 1:length(Names)) {
|
for (j in 1:length(Names)) {
|
||||||
|
|
||||||
#omit empty rows
|
#omit empty rows
|
||||||
df_temp <- na.omit(df_all[Names[j]])
|
df_temp <- na.omit(df_all[Names[j]])
|
||||||
|
|
||||||
#rename columns
|
#rename columns
|
||||||
colnames(df_temp) <- (c('bibliographicCitation'))
|
colnames(df_temp) <- (c('bibliographicCitation'))
|
||||||
|
|
||||||
#create a big dataframe
|
#create a big dataframe
|
||||||
temp <- rbind(temp, df_temp)
|
temp <- rbind(temp, df_temp)
|
||||||
}
|
}
|
||||||
|
|
||||||
my_table[[i]] <- temp
|
my_table[[spalte]] <- temp
|
||||||
}
|
}
|
||||||
|
|
||||||
#combine all the columns of all tables into one
|
#combine all the columns of all tables into one
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue