From faf4f38dac36da5b2e35ff9e5df8d32ce00d7aae Mon Sep 17 00:00:00 2001 From: Linsberger Christian Date: Wed, 8 Sep 2021 09:04:52 +0200 Subject: [PATCH] structural changes --- 01_find_new_entries.R | 1 + 02_not_in_db.R | 2 +- 03_find_URLs.R | 31 +------------------------------ 04_find_and_replace_uri.R | 2 +- sparql.R => functions.R | 29 +++++++++++++++++++++++++++++ 5 files changed, 33 insertions(+), 32 deletions(-) rename sparql.R => functions.R (56%) diff --git a/01_find_new_entries.R b/01_find_new_entries.R index ad9e139..6d9a014 100644 --- a/01_find_new_entries.R +++ b/01_find_new_entries.R @@ -4,6 +4,7 @@ library("tidyverse") library("xlsx") source("config/config.R") +source("functions.R") #Übernimmt die Liste der zu durchsuchenden Excelfiles aus der Konfiguration file_list <- FIND_DISTINCTS_FILES diff --git a/02_not_in_db.R b/02_not_in_db.R index 57cac6f..a75cdd3 100644 --- a/02_not_in_db.R +++ b/02_not_in_db.R @@ -8,7 +8,7 @@ library("tidyverse") #Konfiguration einbinden source("config/config.R") -source("sparql.R") +source("functions.R") #Datenbank einlesen und doppelte entfernen inDB <- data.frame(get_current_geoera_lit_db()$citation) diff --git a/03_find_URLs.R b/03_find_URLs.R index 19d2df2..4352368 100644 --- a/03_find_URLs.R +++ b/03_find_URLs.R @@ -4,36 +4,7 @@ library("writexl") #Konfiguration einbinden source("config/config.R") - -## nicht meine Funktion - eingebunden von https://rdrr.io/cran/retractcheck/src/R/utils.R -find_doi <- function (strings) { - regex <- '10\\.\\d{4,9}/[-._;()/:A-Z0-9]+' - doiLoc <- gregexpr(text = strings, pattern = regex, perl = TRUE, ignore.case = TRUE) - - i <- 1 - res <- NULL - - # for each in the doiLoc list check whether match (!-1) - for ( i in 1:length(doiLoc) ) { - if ( doiLoc[[i]][1] != -1 ) { - for ( j in 1:length(doiLoc[[i]]) ) { - res <- c(res, - substring(strings[i], doiLoc[[i]][j], doiLoc[[i]][j] + attr(doiLoc[[i]], 'match.length')[j] - 1)) - } - } - } - - return(res) -} - - -## nicht meine Funktion - eingebunden von https://stackoverflow.com/questions/52911812/check-if-url-exists-in-r -valid_url <- function(url_in,t=2){ - con <- url(url_in) - check <- suppressWarnings(try(open.connection(con,open="rt",timeout=t),silent=T)[1]) - suppressWarnings(try(close.connection(con),silent=T)) - ifelse(is.null(check),TRUE,FALSE) -} +source("functions.R") url_pattern <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" diff --git a/04_find_and_replace_uri.R b/04_find_and_replace_uri.R index 5567e9f..cb9c778 100644 --- a/04_find_and_replace_uri.R +++ b/04_find_and_replace_uri.R @@ -8,7 +8,7 @@ library("tictoc") library("rlang") #Funktionen & Konfiguration einbinden -source("sparql.R") +source("functions.R") source("config/config.R") diff --git a/sparql.R b/functions.R similarity index 56% rename from sparql.R rename to functions.R index c824852..cc2c62d 100644 --- a/sparql.R +++ b/functions.R @@ -59,3 +59,32 @@ get_current_geoera_lit_db <- function() { } +## nicht meine Funktion - eingebunden von https://rdrr.io/cran/retractcheck/src/R/utils.R +find_doi <- function (strings) { + regex <- '10\\.\\d{4,9}/[-._;()/:A-Z0-9]+' + doiLoc <- gregexpr(text = strings, pattern = regex, perl = TRUE, ignore.case = TRUE) + + i <- 1 + res <- NULL + + # for each in the doiLoc list check whether match (!-1) + for ( i in 1:length(doiLoc) ) { + if ( doiLoc[[i]][1] != -1 ) { + for ( j in 1:length(doiLoc[[i]]) ) { + res <- c(res, + substring(strings[i], doiLoc[[i]][j], doiLoc[[i]][j] + attr(doiLoc[[i]], 'match.length')[j] - 1)) + } + } + } + + return(res) +} + + +## nicht meine Funktion - eingebunden von https://stackoverflow.com/questions/52911812/check-if-url-exists-in-r +valid_url <- function(url_in,t=2){ + con <- url(url_in) + check <- suppressWarnings(try(open.connection(con,open="rt",timeout=t),silent=T)[1]) + suppressWarnings(try(close.connection(con),silent=T)) + ifelse(is.null(check),TRUE,FALSE) +}