First
This commit is contained in:
parent
47462a438e
commit
0039d2b2de
6 changed files with 127 additions and 1 deletions
13
URI_Replacement.Rproj
Normal file
13
URI_Replacement.Rproj
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
Version: 1.0
|
||||||
|
|
||||||
|
RestoreWorkspace: Default
|
||||||
|
SaveWorkspace: Default
|
||||||
|
AlwaysSaveHistory: Default
|
||||||
|
|
||||||
|
EnableCodeIndexing: Yes
|
||||||
|
UseSpacesForTab: Yes
|
||||||
|
NumSpacesForTab: 2
|
||||||
|
Encoding: UTF-8
|
||||||
|
|
||||||
|
RnwWeave: Sweave
|
||||||
|
LaTeX: pdfLaTeX
|
||||||
BIN
data/GC3D_Limits_ref.xlsx
Normal file
BIN
data/GC3D_Limits_ref.xlsx
Normal file
Binary file not shown.
BIN
data/GC3D_Units_ref.xlsx
Normal file
BIN
data/GC3D_Units_ref.xlsx
Normal file
Binary file not shown.
BIN
data/inDB-2021_09_01.xlsx
Normal file
BIN
data/inDB-2021_09_01.xlsx
Normal file
Binary file not shown.
111
find_and_replace_uri.R
Normal file
111
find_and_replace_uri.R
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
library("readxl")
|
||||||
|
library("writexl")
|
||||||
|
library("dplyr")
|
||||||
|
library("xlsx")
|
||||||
|
library("stringi")
|
||||||
|
library("SPARQL")
|
||||||
|
library("stringr")
|
||||||
|
library("tictoc")
|
||||||
|
|
||||||
|
source("../sparql.R")
|
||||||
|
|
||||||
|
###### Configuration #########

# Working directory.
# NOTE(review): machine-specific absolute path; every other relative path in
# this script is resolved against it.
setwd("C:/Users/linchr/ownCloud/GIT/URI_Replacement")

# Load helper functions (provides get_current_geoera_lit_db()).
source("sparql.R")

# Excel file whose cells are to be replaced.
PFAD_EXCEL = "GC3D_Limits_ref.xlsx"

# Column numbers in the Excel sheet in which replacements are made.
SPALTEN = c(2,3,4,5,6)

# Column number of the database table that is compared against (citations).
INHALTE_DB = 3

# Column number of the database table whose content is inserted (URIs).
ID = 1

##############################


### Read input ###
inDB = get_current_geoera_lit_db()

to_replace_original = read_excel(PFAD_EXCEL)
to_replace_done = to_replace_original

###

# Regular expressions shared by the Excel side and the database side.
NON_WORD_PATTERN = "[^[:alnum:] ]"
NUMBER_PATTERN = "\\(?[0-9]+\\)?"

# Split a lower-cased citation into its alphanumeric words.
split_words = function(text) {
  unlist(strsplit(gsub(NON_WORD_PATTERN, "", text), " +"))
}

# The database citations do not change while the sheet is processed, so they
# are lower-cased and tokenised once instead of once per Excel cell.
db_citations = tolower(as.character(inDB[[INHALTE_DB]]))
db_uris = as.character(inDB[[ID]])
db_words = lapply(db_citations, split_words)
# str_extract_all() returns a list with one character vector per input string.
db_numbers = str_extract_all(db_citations, NUMBER_PATTERN)

# Overall timer; closed by the final toc() below.
tic("total")

# Loop over exactly the configured columns. Iterating the vector itself (not
# the range first:last) also works for non-contiguous column numbers.
for (spalte_excel in SPALTEN) {

  # Start timing for the current column.
  tic(paste("starte mit Spalte",spalte_excel))

  excel_column = tolower(as.character(to_replace_original[[spalte_excel]]))

  for (zeile_excel in seq_along(excel_column)) {

    current_excel = excel_column[zeile_excel]

    # Empty cells stay untouched.
    if (is.na(current_excel)) {
      next
    }

    # Entries shorter than 20 characters are blanked instead of being matched.
    if (nchar(current_excel) < 20) {
      to_replace_done[[spalte_excel]][zeile_excel] = ""
      next
    }

    excel_words = split_words(current_excel)
    # [[1]] unwraps the single-element list so that the individual numbers
    # (not the list as a whole) are compared below.
    excel_numbers = str_extract_all(current_excel, NUMBER_PATTERN)[[1]]

    # A cell without any alphanumeric word cannot be matched.
    if (length(excel_words) == 0) {
      next
    }

    # Share of the cell's words that occur in each database citation.
    word_ratio = vapply(
      db_words,
      function(words) mean(excel_words %in% words),
      numeric(1)
    )

    # TRUE where every number of the cell occurs in the database citation
    # (equivalent to the former "> 0.99" threshold; a cell without numbers
    # imposes no numeric constraint).
    numbers_match = vapply(
      db_numbers,
      function(numbers) all(excel_numbers %in% numbers),
      logical(1)
    )

    hits = which(word_ratio > 0.80 & numbers_match)

    if (length(hits) > 0) {
      # As before, the last matching database row wins.
      last_hit = hits[length(hits)]
      to_replace_done[[spalte_excel]][zeile_excel] = str_remove_all(db_uris[last_hit], "[<>]")
    }
  }

  toc()
}

toc()

pfad_output = paste("replaced_",PFAD_EXCEL, sep="")

write_xlsx(to_replace_done, pfad_output)
|
||||||
|
|
||||||
2
sparql.R
2
sparql.R
|
|
@ -54,3 +54,5 @@ get_current_geoera_lit_db <- function() {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#a change
|
||||||
Loading…
Add table
Add a link
Reference in a new issue