# find_URLs.R
#
# Reads an Excel sheet of bibliographic citations, extracts the first URL
# found in each citation into a new `downloadLink` column and writes the
# result to a date-stamped Excel file.
#
# Provenance: added in commit fba552577ec4667397f3818ac9a36185504fd98d
# (Linsberger Christian, 2021-09-07) as a script that finds URLs and puts
# them in a new column.

library("stringr")
library("readxl")
library("writexl")

# Load the configuration. PFAD_EXCEL and PFAD_OUT are used below and are
# presumably defined there -- TODO confirm.
source("config/config.R")

# Pattern for http/https URLs.
# NOTE: inside the bracket expression, `$-_` is a character *range*
# (ASCII 0x24 to 0x5F). That range is what lets `/`, `:`, `?` and `=` match,
# but it also covers `.`, `,`, `;` and `)`, so punctuation directly following
# a URL in a citation is captured as part of the match.
url_pattern <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

# Read the Excel file with the citations that are to be added.
df <- data.frame(
  read_excel(paste0(PFAD_EXCEL, "distincts_kontrolliert.xlsx"))
)

# Re-encode every column as UTF-8. The function lives in stringi, which is
# not attached by library("stringr"), hence the explicit namespace.
# As before, this turns every column into a character column.
df <- data.frame(lapply(df, stringi::stri_enc_toutf8))

# Fail early with a clear message instead of an obscure error further down.
if (!"bibliographicCitation" %in% names(df)) {
  stop(
    "Column 'bibliographicCitation' not found in the input file.",
    call. = FALSE
  )
}

# Create the download-link column: first URL per citation, NA where the
# citation contains none. str_extract() is vectorised, so this also works
# for a sheet with zero rows (where looping over 1:length(x) would break).
df$downloadLink <- str_extract(df[["bibliographicCitation"]], url_pattern)

# Write the result; the file name carries today's date (YYYY_MM_DD).
out_file <- paste0(
  PFAD_OUT,
  "distincts_automatisch_mit_URL",
  format(Sys.time(), "%Y_%m_%d"),
  ".xlsx"
)
write_xlsx(df, out_file)