diff --git a/DESCRIPTION b/DESCRIPTION
index b4393de..abb7ecc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -45,11 +45,11 @@ Imports:
Hmisc,
igraph,
Matrix,
- magrittr,
network,
stringdist,
rworldmap,
- sna
+ sna,
+ tidygeocoder
Suggests:
covr,
gdtools,
@@ -63,7 +63,7 @@ VignetteBuilder:
Remotes:
dkahle/ggmap
Encoding: UTF-8
-RoxygenNote: 7.1.2
+RoxygenNote: 7.3.2
X-schema.org-keywords:
name disambiguation,
bibliometrics,
diff --git a/NAMESPACE b/NAMESPACE
index a1fac86..776134d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -10,10 +10,6 @@ export(plot_net_coauthor)
export(plot_net_country)
export(references_read)
importFrom(ggmap,geocode)
-importFrom(ggplot2,theme)
-importFrom(dplyr,arrange)
-importFrom(dplyr,tally)
-importFrom(magrittr, "%>%")
importFrom(network,"%v%")
importFrom(rworldmap,addMapLegend)
importFrom(stats,na.omit)
diff --git a/NEWS.md b/NEWS.md
index ea7a4d7..aab68f0 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,17 +1,45 @@
# refsplitr News
+refsplitr 1.2 (2025-04-26)
+=========================
+
+### NEW FEATURES
+
+ * The new default service for georeferencing author institutions is the free
+ [Nominatim](https://nominatim.org/) service, which uses OpenStreetMap (OSM)
+ data and which `refsplitr` queries via the
+ [`tidygeocoder`](https://jessecambon.github.io/tidygeocoder/) package
+ (see the example below). The Google Maps API is still an option, but users
+ should be aware that their georeferencing request may exceed the free query
+ limit.
+
+ * The `authors_address` function has been updated and is now more efficient.
+
+ * In `plot_net_address`, the deprecated function `fortify` has been replaced
+ with `sf_convert`.
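+
+ A minimal sketch of the kind of Nominatim query `refsplitr` now issues
+ internally via `tidygeocoder` (the address shown is illustrative):
+
+ ```r
+ library(tidygeocoder)
+ # query the free OSM/Nominatim service for a "city, country" string;
+ # returns a tibble with lat and long columns
+ geo(address = "gainesville, usa", method = "osm")
+ ```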
+
+
refsplitr 1.0.2 (2024-08-12)
=========================
### NEW FEATURES
- * `references_read` now extracts additional fields from Web of Science records: WE (Source Database), C3 (all author affiliations, equivalent to the Scopus `affiliations` field code), EI (eISSN), OA (Open Access), and RID (the original version of the Thomson-Reuters ResearcherID (RI); authors of some older publications might have an RID but not an RI). These are not included in the default output of `references_read`, to include them use `include_all = TRUE`.
+ * `references_read` now extracts additional fields from Web of Science
+ records: WE (Source Database), C3 (all author affiliations, equivalent to the
+ Scopus `affiliations` field code), EI (eISSN), OA (Open Access), and RID
+ (the original version of the Thomson-Reuters ResearcherID (RI); authors of
+ some older publications might have an RID but not an RI). These are not
+ included in the default output of `references_read`; to include them, use
+ `include_all = TRUE` (see the example below).
- * `references_read` no longer extracts some rarely used field codes: GE, LT, MC, MI, and TA
+ * `references_read` no longer extracts some rarely used field codes:
+ GE, LT, MC, MI, and TA
- * The following field codes are now returned by default when using `references_read`: DT (Document Type), ID (Keywords Plus), IS (Issue), JI (ISO abbreviated source code), and NR (number of references cited by the article).
+ * The following field codes are now returned by default when using
+ `references_read`: DT (Document Type), ID (Keywords Plus), IS (Issue),
+ JI (ISO abbreviated source code), and NR (number of references cited
+ by the article).
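+
+ A minimal usage sketch (the file name is illustrative):
+
+ ```r
+ library(refsplitr)
+ # read WOS records and keep the optional fields (WE, C3, EI, OA, and RID)
+ refs <- references_read("savedrecs.txt", include_all = TRUE)
+ ```
+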
refsplitr 1.0.1 (2024-07-23)
@@ -19,15 +47,20 @@ refsplitr 1.0.1 (2024-07-23)
### NEW FEATURES
- * output of `plot_net_country()` now includes a list of any authors that have a lat-lon but no country (called with `products$fixable_countries`).Users can correct these and re-run the visualization to include them in the graph.
+ * output of `plot_net_country()` now includes a list of any authors that have
+ a lat-lon but no country (called with `products$fixable_countries`). Users can
+ correct these and re-run the visualization to include them in the graph.
### DEPRECATED AND DEFUNCT
- * Removed the dependency on deprecated package [maptools](https://cran.r-project.org/web/packages/maptools/index.html). [(#90)](https://github.com/ropensci/refsplitr/issues/90)
+ * Removed the dependency on deprecated package
+ [maptools](https://cran.r-project.org/web/packages/maptools/index.html).
+ [(#90)](https://github.com/ropensci/refsplitr/issues/90)
### DOCUMENTATION FIXES
- * Updated README with citation of the _Journal of Open Source Software_ article describing refsplitr.
+ * Updated README with citation of the _Journal of Open Source Software_
+ article describing refsplitr.
refsplitr 0.9.0 (2020-01-14)
diff --git a/R/authors_address.R b/R/authors_address.R
index 2b74932..cea7f43 100644
--- a/R/authors_address.R
+++ b/R/authors_address.R
@@ -1,239 +1,1362 @@
#' Parses out address information and splits it into its respective parts.
-#' This is an internal function used by \code{authors_clean}
-#'
-#' \code{authors_address} This function takes the output from
+#' This is an internal function used by \code{authors_clean}. Note that parsing
+#' addresses is surprisingly difficult, largely because there is no standard
+#' format across journals/countries for how they are reported. For example:
+#' 1) some journals use dept, univ, city, state, postal code, country;
+#' 2) others use univ, dept, country, postal code;
+#' 3) the postal code is sometimes in the same cell as the country, other times not.
+#'
+#' \code{authors_address} This function takes the output from
#' \code{references_read} and pulls out address information. Splitting it into
-#' university, department, city, state, etc.
+#' university, department, city, state, etc.
#' @param addresses the addresses
#' @param ID the authorID
#' @noRd
-authors_address <- function(addresses, ID){
- message("\nSplitting addresses\n")
+authors_address <- function(addresses, ID) {
+ addresses <- tolower(addresses)
+ message("\nSplitting addresses.\n")
+
list_address <- strsplit(addresses, ",")
- university_list <- vapply(list_address, function(x) x[1], character(1))
- country_list <- vapply(list_address, function(x) {
- gsub("\\_", "", x[length(x)]) },
- character(1))
- country_list <- trimws(country_list, which = "both")
- pc_list <- trimws(substr(country_list, 1, (vapply(regexpr("USA",
- country_list), function(x) x[1], numeric(1))) - 1), which = "right")
- state_list <- pc_list
- state_list[nchar(state_list) > 0] <- regmatches(
- state_list[nchar(state_list) > 0],
- regexpr("[[:upper:]]{2}", state_list[nchar(state_list) > 0])
+
+
+ # remove punctuation ----------------------------------------
+
+ ## First remove periods and trim white space from countries.
+  ## This helps avoid mistakes later on.
+
+ remove_period_from_last <- function(list_address) {
+ lapply(list_address, function(x) {
+ if (length(x) > 0) {
+ x[length(x)] <- gsub("\\.$", "", x[length(x)])
+ x[length(x)] <- trimws(x[length(x)], which = "both")
+ }
+ return(x)
+ })
+ }
+
+ list_address <- remove_period_from_last(list_address)
+
+ # trim ws -----------------------------------------------------------------
+
+ list_address <- lapply(list_address, trimws)
+
+ # correct countries -------------------------------------------------------
+
+
+  # Format or update the names of some countries to make georeferencing
+  # possible. The WOS often uses abbreviations; this standardizes them in a
+  # way that `tidygeocoder` can use. It also updates country names that have
+  # changed (e.g., czechia is the new name for the czech republic). In some
+  # cases no changes were made (e.g., united arab rep: the current country
+  # name depends on the city).
+
+ # Define the function
+ correct_countries <- function(my_list, replacements) {
+ # Loop through each element of the list
+ for (i in 1:length(my_list)) {
+ # Get the length of the current element
+ len <- length(my_list[[i]])
+
+ # Check if the last item matches any of the target words
+ if (len > 0 && my_list[[i]][len] %in% names(replacements)) {
+ # Replace the last item with the corresponding replacement word
+ my_list[[i]][len] <- replacements[[my_list[[i]][len]]]
+ }
+ }
+ return(my_list)
+ }
+
+ replacements <- c(
+ "austl" = "australia",
+ "c z" = "czechia",
+ "cz" = "czechia",
+ "czech republic" = "czechia",
+ "fed rep ger" = "germany",
+ "columbia" = "colombia", # a depressingly common mistake
+ "peoples r china" = "china",
+ "u arab emirates" = "united arab emirates",
+ "mongol peo rep" = "mongolia",
+ "dominican rep" = "dominican republic",
+ "fr polynesia" = "french polynesia",
+ "neth antilles" = "netherland antilles",
+ "trinid & tobago" = "trinidad & tobago",
+ "rep congo" = "congo",
+ "north ireland" = "northern ireland",
+ "syrian arab rep" = "syria"
)
- pc_list[nchar(pc_list) > 2] <- regmatches(pc_list[nchar(pc_list) > 2],
- regexpr("[[:digit:]]{5}", pc_list[nchar(pc_list) > 2]))
- pc_list[nchar(pc_list) < 3] <- ""
- country_list <- ifelse(grepl("USA", country_list), "USA", country_list)
+ message("\nstandardizing country names...\n")
+
+ list_address <- correct_countries(list_address, replacements)
+
+ # extract university ------------------------------------------------------
+
+ message("\nextracting the names of institutions...\n")
+
+ university_list <- vapply(list_address, function(x) x[1], character(1))
+
+ # extract department ------------------------------------------------------
+
+  # If a department is listed, it is typically the second element
+  # (EB note: only if 4+ slots);
+  # this will be double-checked later
+
+ dept_extract <- function(x) {
+ if (length(x) < 4) {
+ return(NA)
+ } else {
+ return(trimws(x[[2]]))
+ }
+ }
+
+ dept_list <- unlist(lapply(list_address, dept_extract))
+
+ dept_list <- trimws(dept_list, which = "both")
+
+
+ # Extract City ------------------------------------------------------------
+
+ message("\nextracting cities...\n")
+
+  # If there is only one element, then it can't have both city and country
+ city_list <- vapply(list_address, function(x) {
+ n <- length(x)
+ if (n == 1) {
+ return("no city") # placeholder to replace with NA after function
+ }
+
+    # In some cases city is next-to-last element, in others next-to-next-to-last
+    second_last <- if (n > 1) x[[n - 1]] else NA_character_
+
+    # Default case: assume the city is the next-to-last element
+    return(second_last)
+ }, character(1))
+
+ # Cleanup
+ city_list <- trimws(city_list, which = "both")
+ city_list[city_list == "no city"] <- NA
- list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))])
- # Because formats of address printing is different across platforms
- # We are going to split using a tier system assuming first and last
- # info is somewhat reliable and guess the other info from the
- # remaining position of the info
+ # extract state -----------------------------------------------------------
+ message("\nextracting states/provinces...\n")
+
+  # If there is only one element, then it can't include a state
+ state_list <- vapply(list_address, function(x) {
+ n <- length(x)
+ if (n == 1) {
+ return("no state") # placeholder to replace with NA after function
+ }
+
+    # The state, when present, is usually the next-to-next-to-last element
+    third_last <- if (n > 2) x[[n - 2]] else NA_character_
- second_tier_list <- lapply(list_address1, function(x) x[length(x)])
- second_tier_list <- trimws(second_tier_list, which = "both")
- second_tier_list[second_tier_list == "character(0)"] <- NA
+    # Default case: assume the state is the next-to-next-to-last element
+    return(third_last)
+ }, character(1))
- list_address2 <- lapply(list_address1, function(x) x[-c(length(x))])
+ # Cleanup
+ state_list <- trimws(state_list, which = "both")
+ state_list[state_list == "no state"] <- NA
- third_tier_list <- lapply(list_address2, function(x) x[length(x)])
- third_tier_list <- trimws(third_tier_list, which = "both")
- third_tier_list[third_tier_list == "character(0)"] <- NA
+ # this is used to double check later - sometimes city is extracted as state
+ city_list2 <- trimws(state_list, which = "both")
- # All remaining info is just shoved in this category
- remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1])
- remain_list <- trimws(remain_list, which = "both")
- remain_list[remain_list == "character(0)"] <- NA
+ # Extract Country ---------------------------------------------------------
+
+ message("\nextracting country...\n")
+
+ country_list <- vapply(
+ list_address, function(x) {
+ gsub("\\_", "", x[length(x)])
+ },
+ character(1)
+ )
+
+
+ # postal code (pc) list ---------------------------------------------------
+
+ message("\nprocessing postal codes...\n")
+
+  # the postal code often sits in the same element as the city
+
+ pc_list <- city_list
+
+ # bind all into df --------------------------------------------------------
+
+ message("\nreview, correction, and clean-up...\n")
+ message("\nPlease be patient - this might take a bit.\n")
a_df <- data.frame(
- adID = ID, university = university_list,
+ adID = ID,
+ university = university_list,
country = country_list,
- state = state_list, postal_code = pc_list, city = NA,
- department = NA, second_tier = second_tier_list,
- third_tier = third_tier_list,
- remain = remain_list, address = addresses,
+ state = state_list,
+ postal_code = pc_list,
+ city = city_list,
+ city2 = city_list2,
+ department = dept_list,
+ address = addresses,
stringsAsFactors = FALSE
)
- # try to fix the USA spots, which vary in format than other countries
- a_df$city[nchar(a_df$state) > 0] <- a_df$second_tier[nchar(a_df$state) > 0]
- a_df$state[nchar(a_df$state) == 0] <- NA
- a_df$postal_code[nchar(a_df$postal_code) == 0] <- NA
- a_df$department[!is.na(a_df$state) & !is.na(a_df$postal_code) &
- !is.na(a_df$state)] <- a_df$third_tier[!is.na(a_df$state) &
- !is.na(a_df$postal_code) & !is.na(a_df$state)]
- # fix a US problem when USA is not tacked onto the end
-
- us_reg <- "[[:alpha:]]{2}[[:space:]]{1}[[:digit:]]{5}"
- a_df$state[ grepl(us_reg, a_df$country) ] <-
- substr(a_df$country[ grepl(us_reg, a_df$country) ], 1, 2)
-
- a_df$postal_code[ grepl(us_reg, a_df$country) ] <-
- substr(a_df$country[grepl(us_reg, a_df$country)], 4, 8)
-
- a_df$country[grepl(us_reg, a_df$country)] <- "USA"
-
- ##########################
- # We'll use regular expression to pull zipcodes
- # These formats differ by region
- int1 <- "[[:alpha:]]{2}[[:punct:]]{1}[[:digit:]]{1,8}"
- int2 <- paste("[[:space:]][[:upper:]][[:digit:]][[:upper:]]",
- "[[:space:]][[:digit:]][[:upper:]][[:digit:]]", sep="")
- int3 <- "[[:alpha:]][[:punct:]][[:digit:]]{4,7}"
- int4 <- "[:upper:]{1,2}[:alnum:]{1,3}[:space:][:digit:][:alnum:]{1,3}"
- int <- paste(int1, int2, int3, int4, sep = "|")
-
- UK <- paste("[[:upper:]]{1,2}[[:digit:]]{1,2}[[:space:]]",
- "{1}[[:digit:]]{1}[[:upper:]]{2}", sep="")
-
- Mexico <- "[[:space:]]{1}[[:digit:]]{5}" # technically US as well
-
- Panama <- "[[:digit:]]{4}-[[:digit:]]{5}"
-
- zip_search <- paste0(int, "|", UK, "|", Mexico, "|", Panama)
-
- ###########################
- id_run <- a_df$adID[is.na(a_df$state) & is.na(a_df$postal_code) &
- a_df$address != "Could not be extracted"]
- ###########################
-
- # We now iteratively run through the addresses using the concept that
- # certain information always exists next to each other.
- # Ex. city, state, country tend to exist next to each other.
- # We use the position of the zipcode also to help guide us
- # in where the information lies as well as how many fields were
- # given to us.
- for (i in id_run) {
- found <- FALSE
- row <- which(a_df$adID == i)
- university <- a_df$university[row]
- second_tier <- a_df$second_tier[row]
- third_tier <- a_df$third_tier[row]
- remain <- a_df$remain[row]
- city <- NA
- state <- NA
- postal_code <- NA
- department <- NA
- grepl(zip_search, second_tier)
- grepl(zip_search, third_tier)
- # 2nd tier
- if (grepl(zip_search, second_tier)) {
- found <- TRUE
- postal_code <- regmatches(second_tier, regexpr(zip_search, second_tier))
- city <- gsub(zip_search, "", second_tier)
- department <- ifelse(is.na(remain), third_tier, remain)
- }
- # 3RD tiers
- if (grepl(zip_search, third_tier) & !found) {
- found <- TRUE
- postal_code <- regmatches(third_tier, regexpr(zip_search, third_tier))
- city <- gsub(zip_search, "", third_tier)
- state <- second_tier
- department <- remain
- }
- if (!found) {
- state <- second_tier
- city <- third_tier
- department <- remain
- }
- # To make university searching more efficient we'll override values
- # based on if it has university/college in the name,
- # where university overides college
- override_univ <- grepl("\\buniv\\b|\\buniversi",
- c(second_tier, third_tier, remain, city, university),
- ignore.case = TRUE) &
- !grepl("\\bdrv\\b|\\bdrive\\b",
- c(second_tier, third_tier, remain, city, university),
- ignore.case = TRUE)
-
- if (any(override_univ)) {
- university <-
- c(second_tier, third_tier, remain, city, university)[override_univ][1]
- assign(
- c("second_tier", "third_tier", "remain", "city", "university")[
- override_univ][1],
- NA
- )
- }
- # only if university doesnt already exist
- override_univ_col <-
- grepl("\\bcol\\b|college|\\bcoll\\b",
- c(second_tier, third_tier, remain, city, university),
- ignore.case = TRUE) &
- !grepl("\\bdrv\\b|\\bdrive\\b",
- c(second_tier, third_tier, remain, city, university),
- ignore.case = TRUE)
-
- if (!any(override_univ) & any(override_univ_col)) {
- university <-
- c(second_tier, third_tier, remain, city, university )[
- override_univ_col][1]
-
- assign(
- c("second_tier", "third_tier", "remain", "city", "university")[
- override_univ_col][1],
- NA
- )
- }
- # more risky, but institutions as well, just incase its not a university
- override_univ_inst <- grepl("\\binst\\b|\\binstitut",
- c(second_tier, third_tier, remain, city, university),
- ignore.case = TRUE)
- if (
- !any(override_univ) & !any(override_univ_col) & any(override_univ_inst)
- ) {
- department <- c(second_tier, third_tier, remain, city, university )[
- override_univ_inst][1]
-
- assign(
- c("second_tier", "third_tier", "remain", "city", "university")[
- override_univ_inst][1],
- NA
- )
- }
- a_df$city[row] <- gsub("[[:digit:]]", "", city)
- a_df$state[row] <- gsub("[[:digit:]]", "", state)
- a_df$postal_code[row] <- postal_code
- a_df$department[row] <- department
+ # any PC without numbers gets NA'd
+ a_df$postal_code[!grepl("\\d", a_df$postal_code)] <- NA
+
+  # recover US postal codes and states embedded in other fields
+ a_df$state <- ifelse(grepl("usa", a_df$country) & nchar(a_df$state) > 2,
+ NA,
+ a_df$state
+ )
+
+
+ a_df$postal_code <- ifelse(grepl("[a-z]{2} [0-9]{5} usa", a_df$country),
+ a_df$country, a_df$postal_code
+ )
+
+ a_df$state <- ifelse(grepl("[a-z]{2} [0-9]{5} usa", a_df$country),
+ a_df$country, a_df$state
+ )
+
+ a_df$state <- ifelse(grepl("[a-z]{2} [0-9]{5}", a_df$city),
+ a_df$city, a_df$state
+ )
+
+
+ a_df$state <- ifelse(grepl("[a-z]{2} usa", a_df$country),
+ a_df$country, a_df$state
+ )
+
+ # remove the numbers and letters as appropriate
+
+
+ a_df$country <- ifelse(grepl(" usa", a_df$country),
+ "usa", a_df$country
+ )
+
+ a_df$state <- ifelse(a_df$country == "usa" & grepl(
+ "[a-z]{2} [0-9]{5}",
+ a_df$state
+ ),
+ gsub("[[:digit:]]{5}", "", a_df$state),
+ a_df$state
+ )
+
+ a_df$state <- ifelse(a_df$country == "usa" & grepl(" usa", a_df$state),
+ gsub(" usa", "", a_df$state),
+ a_df$state
+ )
+
+
+ a_df$postal_code <- ifelse(a_df$country == "usa",
+ gsub(
+ "[[:alpha:]]{2} ", "",
+ a_df$postal_code
+ ), a_df$postal_code
+ )
+
+ a_df$postal_code <- ifelse(a_df$country == "usa",
+ gsub(
+ " usa", "",
+ a_df$postal_code
+ ), a_df$postal_code
+ )
+
+
+
+
+ a_df$city <- ifelse(a_df$country == "usa" & grepl(
+ "[a-z]{2} [0-9]{5}",
+ a_df$city
+ ),
+ a_df$city2,
+ a_df$city
+ )
+
+
+ pattern <- "[a-z]{2} [0-9]{5}"
+
+ a_df$postal_code <- ifelse(grepl(pattern, a_df$country),
+ a_df$country, a_df$postal_code
+ )
+ a_df$state <- ifelse(grepl(pattern, a_df$country),
+ a_df$country, a_df$state
+ )
+ a_df$country <- ifelse(grepl(pattern, a_df$country),
+ "usa", a_df$country
+ )
+ a_df$postal_code <- ifelse(a_df$country == "usa" & grepl(pattern, a_df$postal_code),
+ gsub("[a-z]", "", a_df$postal_code),
+ a_df$postal_code
+ )
+ a_df$state <- ifelse(a_df$country == "usa" & grepl(pattern, a_df$state),
+ gsub(
+ "[0-9]",
+ "",
+ a_df$postal_code
+ ),
+ a_df$state
+ )
+
+
+
+ # BRAZIL clean-up ---------------------------------------------------------
+
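+  # Brazilian addresses often end "BR-<postal code> city, STATE, brazil", so
+  # two-letter state abbreviations land in the city slot and the "br-" prefix
+  # and digits must be stripped out to separate the city from the postal code.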
+ a_df$state <- ifelse(a_df$country == "brazil" & nchar(a_df$city) == 2,
+ a_df$city,
+ a_df$state
+ )
+ a_df$city <- ifelse(a_df$country == "brazil" & nchar(a_df$city) == 2,
+ a_df$city2,
+ a_df$city
+ )
+ a_df$city2 <- ifelse(a_df$country == "brazil" & a_df$city == a_df$city2,
+ NA,
+ a_df$city2
+ )
+ a_df$postal_code <- ifelse(a_df$country == "brazil" & is.na(a_df$postal_code),
+ a_df$city,
+ a_df$postal_code
+ )
+ a_df$state <- ifelse(a_df$country == "brazil" & nchar(a_df$state) > 2,
+ NA,
+ a_df$state
+ )
+
+ a_df$postal_code <- ifelse(a_df$country == "brazil",
+ gsub(
+ "[A-Za-z]",
+ "",
+ a_df$postal_code
+ ),
+ a_df$postal_code
+ )
+ a_df$postal_code <- ifelse(a_df$country == "brazil",
+ gsub(
+ "[-]",
+ "",
+ a_df$postal_code
+ ),
+ a_df$postal_code
+ )
+
+ a_df$city <- ifelse(a_df$country == "brazil",
+ gsub(
+ "br-",
+ "",
+ a_df$city
+ ),
+ a_df$city
+ )
+ a_df$city <- ifelse(a_df$country == "brazil",
+ gsub(
+ "[0-9]",
+ "",
+ a_df$city
+ ),
+ a_df$city
+ )
+
+
+ a_df$state <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2),
+ a_df$city,
+ a_df$state
+ )
+ a_df$postal_code <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2),
+ a_df$city2,
+ a_df$postal_code
+ )
+ a_df$city <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2),
+ a_df$city2,
+ a_df$city
+ )
+
+ message("\n(still working on it...)\n")
+
+ # repeat the clean of city
+ a_df$city <- ifelse(a_df$country == "brazil",
+ gsub("br-", "", a_df$city),
+ a_df$city
+ )
+ a_df$city <- ifelse(a_df$country == "brazil",
+ gsub("[0-9]", "", a_df$city),
+ a_df$city
+ )
+
+
+ a_df$postal_code <- ifelse(a_df$country == "brazil",
+ gsub("[A-Za-z]", "", a_df$postal_code),
+ a_df$postal_code
+ )
+ a_df$postal_code <- ifelse(a_df$country == "brazil",
+ gsub("[-]", "", a_df$postal_code),
+ a_df$postal_code
+ )
+
+ a_df[] <- lapply(a_df, trimws)
+
+
+ # Define city-to-state mapping
+ city_state_mapping <- data.frame(
+ city = c(
+ "ribeirao preto", "sao carlos", "rio claro", "sorocaba",
+ "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"
+ ),
+ state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"),
+ stringsAsFactors = FALSE
+ )
- #########################Clock###############################
- total <- length(id_run)
- pb <- utils::txtProgressBar(min = 0, max = total, style = 3)
- utils::setTxtProgressBar(pb, which(id_run == i))
- #############################################################
+ # Match cities and states
+ for (i in 1:nrow(city_state_mapping)) {
+ a_df$city <- ifelse(a_df$country == "brazil" &
+ grepl(city_state_mapping$city[i],
+ a_df$state,
+ ignore.case = TRUE
+ ),
+ city_state_mapping$city[i], a_df$city
+ )
+ a_df$state <- ifelse(a_df$country == "brazil" &
+ grepl(city_state_mapping$city[i],
+ a_df$state,
+ ignore.case = TRUE
+ ),
+ city_state_mapping$state[i], a_df$state
+ )
}
- city_fix <- is.na(a_df$city) & !is.na(a_df$state)
- a_df$city[city_fix] <- a_df$state[city_fix]
- a_df$state[city_fix] <- NA
- a_df$university[a_df$university == "Could not be extracted"] <- NA
- a_df$country[a_df$country == "Could not be extracted"] <- NA
- a_df$country[a_df$country == "Peoples R China"] <- "China"
- a_df$postal_code[grepl("[[:alpha:]]{1,2}-", a_df$postal_code)] <-
- vapply(strsplit(
- a_df$postal_code[ grepl("[[:alpha:]]{1,2}-", a_df$postal_code)],
- "-"),
- function(x) x[2], character(1)
+ # Match cities and states
+ for (i in 1:nrow(city_state_mapping)) {
+ a_df$state <- ifelse(a_df$country == "brazil" &
+ grepl(city_state_mapping$city[i], a_df$city, ignore.case = TRUE),
+ city_state_mapping$state[i],
+ a_df$state
)
- #strip periods from the ends of city,state,country
- a_df$city <- gsub("\\.", "", a_df$city)
- a_df$state <- gsub("\\.", "", a_df$state)
- a_df$country <- gsub("\\.", "", a_df$country)
- a_df$country[a_df$country == ""] <- NA
- a_df$university[a_df$university == ""] <- NA
- a_df$postal_code[a_df$postal_code == ""] <- NA
- #convert to lower
- for (l in 2:ncol(a_df)){
- a_df[, l] <- tolower(a_df[, l])
}
+
+  # AUSTRALIA clean-up --------------------------------------------------------
+
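+  # Australian addresses usually end "city, STATE postcode, australia", so the
+  # parsed "city" slot actually holds "state postcode" and the real city is in
+  # city2; shift the fields, then split the state (letters) from the postcode
+  # (digits).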
+ a_df$state <- ifelse(a_df$country == "australia",
+ a_df$city, a_df$state
+ )
+ a_df$postal_code <- ifelse(a_df$country == "australia",
+ a_df$city, a_df$postal_code
+ )
+ a_df$city <- ifelse(a_df$country == "australia",
+ a_df$city2, a_df$city
+ )
+ a_df$city2 <- ifelse(a_df$country == "australia",
+ NA, a_df$city2
+ )
+
+
+ a_df$postal_code <- ifelse(a_df$country == "australia",
+ gsub("[A-Za-z]", "", a_df$postal_code),
+ a_df$postal_code
+ )
+ a_df$state <- ifelse(a_df$country == "australia",
+ gsub("[0-9]", "", a_df$state),
+ a_df$state
+ )
+
+ a_df[] <- lapply(a_df, trimws)
+
+
+
+ # CANADA clean-up ---------------------------------------------------------
+
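+  # Canadian postal codes ("a1a 1a1") usually travel with the city or the
+  # two-letter province; pull the "xxx xxx" pattern into postal_code, recover
+  # the city from city2, and keep the two-letter abbreviation as the state.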
+ a_df$state <- ifelse(a_df$country == "canada" & nchar(a_df$city) == 2,
+ a_df$city,
+ a_df$state
+ )
+
+ a_df$city <- ifelse(a_df$country == "canada" & nchar(a_df$city) == 2,
+ NA,
+ a_df$city
+ )
+
+ a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2),
+ a_df$city2,
+ a_df$postal_code
+ )
+ a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2),
+ a_df$city2,
+ a_df$postal_code
+ )
+
+ a_df$state <- ifelse(a_df$country == "canada" & a_df$city2 == a_df$state,
+ NA, a_df$state
+ )
+
+ a_df$city <- ifelse(a_df$country == "canada",
+ a_df$city2,
+ a_df$city
+ )
+ a_df$city2 <- ifelse(a_df$country == "canada",
+ NA,
+ a_df$city2
+ )
+
+
+ a_df$city <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city),
+ gsub(
+ "\\b(\\w{3})\\b \\b(\\w{3})\\b",
+ "",
+ a_df$city
+ ),
+ a_df$city
+ )
+
+ a_df$state <- ifelse(a_df$country == "canada" & is.na(a_df$state),
+ a_df$postal_code,
+ a_df$state
+ )
+
+ a_df$state <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$state),
+ gsub(
+ "\\b(\\w{3})\\b \\b(\\w{3})\\b",
+ "",
+ a_df$state
+ ),
+ a_df$state
+ )
+
+ a_df$postal_code <- ifelse(a_df$country == "canada" &
+ grepl(
+ "\\b(\\w{2,20})\\b \\b(\\w{3})\\b \\b(\\w{3})\\b",
+ a_df$postal_code
+ ),
+ gsub(
+ "\\b(\\w{1,2}|\\w{4,})\\b",
+ "",
+ a_df$postal_code
+ ),
+ a_df$postal_code
+ )
+
+ a_df[] <- lapply(a_df, trimws)
+
+ # TODO: a few postal codes still have letters from city
+
+
+ a_df$postal_code <- ifelse(a_df$country == "canada",
+ gsub(" ", "", a_df$postal_code),
+ a_df$postal_code
+ )
+
+
+ # UK clean-up -------------------------------------------------------------
+
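+  # UK postcodes (e.g., "ox1 3sz") can land in city, city2, or state; move
+  # anything matching the postcode pattern into postal_code, then rebuild
+  # city/state from what remains.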
+ uk <- c("scotland", "england", "wales", "northern ireland")
+ pattern <- "[a-z0-9]{2,4} [a-z0-9]{3,4}"
+
+ a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city2),
+ a_df$city2,
+ a_df$postal_code
+ )
+ a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$state),
+ a_df$state,
+ a_df$postal_code
+ )
+ a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city),
+ a_df$city,
+ a_df$postal_code
+ )
+
+ a_df$postal_code <- ifelse(a_df$country %in% uk,
+ ifelse(!grepl("\\d", a_df$postal_code), NA, a_df$postal_code),
+ a_df$postal_code
+ )
+
+ a_df$city <- ifelse(a_df$country %in% uk & a_df$city == a_df$postal_code,
+ NA, a_df$city
+ )
+
+ a_df$state <- ifelse(a_df$country %in% uk & a_df$state == a_df$postal_code,
+ NA, a_df$state
+ )
+
+
+ a_df$state <- ifelse(a_df$country == "england",
+ a_df$city,
+ a_df$state
+ )
+ a_df$city <- ifelse(a_df$country == "england",
+ NA,
+ a_df$city
+ )
+ a_df$city <- ifelse(a_df$country == "england",
+ a_df$postal_code,
+ a_df$city
+ )
+ a_df$city <- ifelse(a_df$country == "england",
+ gsub("\\b\\w*\\d\\w*\\b", "", a_df$city),
+ a_df$city
+ )
+
+ message("\n(getting closer...)\n")
+
+ # TODO: england still needs work
+
+ a_df$state <- ifelse(a_df$country == "scotland" |
+ a_df$country == "northern ireland" |
+ a_df$country == "wales",
+ NA,
+ a_df$state
+ )
+ a_df$state <- ifelse(a_df$country == "scotland" |
+ a_df$country == "northern ireland" |
+ a_df$country == "wales" &
+ is.na(a_df$state),
+ a_df$city,
+ a_df$state
+ )
+ a_df$city <- ifelse(a_df$country == "scotland" |
+ a_df$country == "northern ireland" |
+ a_df$country == "wales",
+ a_df$postal_code,
+ a_df$city
+ )
+ a_df$city <- ifelse(a_df$country == "scotland" |
+ a_df$country == "northern ireland" |
+ a_df$country == "wales" &
+ is.na(a_df$city),
+ a_df$city2,
+ a_df$city
+ )
+ a_df$city <- ifelse(a_df$country == "scotland" |
+ a_df$country == "northern ireland" |
+ a_df$country == "wales",
+ gsub(
+ "\\b\\w*\\d\\w*\\b",
+ "",
+ a_df$city
+ ),
+ a_df$city
+ )
+
+
+ # postal codes clean uk ---------------------------------------------------
+
+
+ # Define the function
+ keep_numerical_parts <- function(df, control_col, country, target_col) {
+    # Apply the extraction row by row
+ df[[target_col]] <- sapply(1:nrow(df), function(i) {
+ if (df[[control_col]][i] == country) {
+ # Use gregexpr to find all parts of the string that include a numeral
+ matches <- gregexpr("\\b\\S*\\d\\S*\\b", df[[target_col]][i])
+ # Extract the matched parts
+ result <- regmatches(df[[target_col]][i], matches)
+ # Combine the matched parts into a single string
+ result <- unlist(result)
+ result <- paste(result, collapse = " ")
+ result <- gsub(" ", "", result)
+ return(result)
+ } else {
+ return(df[[target_col]][i])
+ }
+ })
+
+ return(df)
+ }
+
+
+ a_df <- keep_numerical_parts(a_df, "country", "scotland", "postal_code")
+ a_df <- keep_numerical_parts(a_df, "country", "england", "postal_code")
+ a_df <- keep_numerical_parts(a_df, "country", "northern ireland", "postal_code")
+ a_df <- keep_numerical_parts(a_df, "country", "wales", "postal_code")
+
+
+
+
+
+
+
+ # INDIA clean-up ----------------------------------------------------------
+
+
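+  # Indian addresses typically end "city postcode, india"; treat long digit
+  # strings as postal codes and rebuild city/state from the remaining elements.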
+ a_df$postal_code <- ifelse(a_df$country == "india" & grepl("[0-9]{5,10}", a_df$city2),
+ a_df$city2,
+ a_df$postal_code
+ )
+ a_df$postal_code <- ifelse(a_df$country == "india" & grepl("[0-9]{5,10}", a_df$city),
+ a_df$city,
+ a_df$postal_code
+ )
+
+
+ a_df$city2 <- ifelse(a_df$country == "india" & a_df$state == a_df$city2,
+ a_df$state,
+ a_df$city2
+ )
+ a_df$state <- ifelse(a_df$country == "india", NA, a_df$state)
+ a_df$state <- ifelse(a_df$country == "india" & is.na(a_df$postal_code),
+ a_df$city,
+ a_df$state
+ )
+ a_df$city <- ifelse(a_df$country == "india" & a_df$state == a_df$city,
+ NA,
+ a_df$city
+ )
+ a_df$city <- ifelse(a_df$country == "india" & grepl("[0-9]{4,10}", a_df$postal_code),
+ a_df$postal_code,
+ a_df$city
+ )
+ a_df$city <- ifelse(a_df$country == "india" & is.na(a_df$city),
+ a_df$city2,
+ a_df$city
+ )
+
+
+ a_df$postal_code <- ifelse(a_df$country == "india",
+ gsub("[A-Za-z]", "", a_df$postal_code),
+ a_df$postal_code
+ )
+ a_df$city <- ifelse(a_df$country == "india",
+ gsub("[0-9]", "", a_df$city),
+ a_df$city
+ )
+
+ a_df$city <- ifelse(a_df$country == "india" &
+ (grepl("delhi", a_df$city) | grepl("delhi", a_df$state)),
+ "new delhi",
+ a_df$city
+ )
+
+
+ # CHINA clean-up ----------------------------------------------------------
+
+
+
+
+
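+  # Chinese addresses usually end "city postcode, peoples r china"; pull long
+  # digit strings into postal_code, rebuild city/state, and use the province
+  # list below to keep provinces out of the city slot.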
+ a_df$postal_code <- ifelse(a_df$country == "china" &
+ grepl("[0-9]{5,10}", a_df$city2),
+ a_df$city2,
+ a_df$postal_code
+ )
+ a_df$postal_code <- ifelse(a_df$country == "china" &
+ grepl("[0-9]{5,10}", a_df$city),
+ a_df$city,
+ a_df$postal_code
+ )
+ a_df$postal_code <- ifelse(a_df$country == "china" &
+ grepl("[0-9]{5,10}", a_df$state),
+ a_df$state,
+ a_df$postal_code
+ )
+
+
+ a_df$city2 <- ifelse(a_df$country == "china" & a_df$state == a_df$city2,
+ a_df$state,
+ a_df$city2
+ )
+ a_df$state <- ifelse(a_df$country == "china",
+ NA,
+ a_df$state
+ )
+ a_df$state <- ifelse(a_df$country == "china" & is.na(a_df$postal_code),
+ a_df$city,
+ a_df$state
+ )
+ a_df$city <- ifelse(a_df$country == "china" & a_df$state == a_df$city,
+ NA,
+ a_df$city
+ )
+ a_df$city <- ifelse(a_df$country == "china" & grepl("[0-9]{4,10}", a_df$postal_code),
+ a_df$postal_code,
+ a_df$city
+ )
+ a_df$city <- ifelse(a_df$country == "china" & is.na(a_df$city),
+ a_df$city2,
+ a_df$city
+ )
+
+
+ a_df$postal_code <- ifelse(a_df$country == "china",
+ gsub("[A-Za-z]", "", a_df$postal_code),
+ a_df$postal_code
+ )
+ a_df$city <- ifelse(a_df$country == "china",
+ gsub("[0-9]", "", a_df$city),
+ a_df$city
+ )
+
+
+
+ a_df$city <- ifelse(a_df$country == "china" & grepl("beijing", a_df$state),
+ "beijing",
+ a_df$city
+ )
+
+
+ # Define words indicating this is actually a dept not state or postal code
+ # will use this list to delete the ones that don't apply
+ to_delete <- c(
+ "&", "inst", "ctr", "med", "chem", "lab", "biol",
+ "dept", "div", "univ", "hosp", "coll", "sci", "rd",
+ "program", "minist", "educ", "sch ", "grad ", "fac ",
+ "assoc", "forest"
+ )
+
+
+ pattern <- paste(to_delete, collapse = "|")
+ # Apply the ifelse function to update
+ a_df$city <- ifelse(a_df$country == "china" &
+ grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE),
+ NA, a_df$city
+ )
+ a_df$city <- ifelse(a_df$country == "china" & is.na(a_df$city),
+ a_df$state, a_df$city
+ )
+
+
+
+ a_df[] <- lapply(a_df, trimws)
+
+
+ message("\n(not much longer...)\n")
+
+ # This verifies that what is in `city` is actually a city
+ # (or at least that what is in `city` is NOT a province)
+
+ chn_states <- c(
+ "guangdong", "shandong", "henan", "jiangsu", "sichuan",
+ "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi",
+ "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi",
+ "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang",
+ "gansu", "inner mongolia", "jilin", "hainan", "ningxia",
+ "qinghai", "tibet", "macao"
+ )
+  pattern <- paste(chn_states, collapse = "|")
+ a_df$city <- ifelse(a_df$country == "china" &
+ grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE),
+ NA, a_df$city
+ )
+
+
+
+ # pc is letters dash numbers ----------------------------------------------
+
+
+ pattern <- "\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b"
+
+ a_df$postal_code <- ifelse(grepl(pattern, a_df$city),
+ a_df$city, a_df$postal_code
+ )
+
+ a_df$postal_code <- ifelse(grepl(pattern, a_df$state),
+ a_df$state,
+ a_df$postal_code
+ )
+
+ a_df$state <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$postal_code),
+ a_df$city,
+ a_df$state
+ )
+
+
+ a_df$city <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$postal_code),
+ a_df$postal_code,
+ a_df$city
+ )
+
+ a_df$city2 <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$city),
+ NA, a_df$city2
+ )
+
+
+
+ a_df$city <- ifelse(grepl(pattern, a_df$city),
+ gsub("[0-9]", "", a_df$city),
+ a_df$city
+ )
+ a_df$city <- gsub("[a-z]{1,2}- ", "", a_df$city)
+
+
+ a_df$city <- gsub("[-]", "", a_df$city)
+ a_df[] <- lapply(a_df, trimws)
+
+
+ pattern <- "\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b"
+
+ a_df$postal_code <- ifelse(grepl(pattern, a_df$postal_code),
+ gsub("[a-z]", "", a_df$postal_code),
+ a_df$postal_code
+ )
+
+ a_df$postal_code <- gsub("[-]", "", a_df$postal_code)
+ a_df[] <- lapply(a_df, trimws)
+
+
+ # final check of postal codes (consecutive nos.) --------------------------
+
+
+ # Define the function
+ extract_consecutive_numbers <- function(df, source, destination) {
+ df[[destination]] <- sapply(1:nrow(df), function(i) {
+ # Use gregexpr to find sequences of 4 or more consecutive numbers
+ if (is.na(df[[destination]][i])) {
+ matches <- gregexpr("\\d{4,}", df[[source]][i])
+ # Extract the matched sequences
+ result <- regmatches(df[[source]][i], matches)
+ # Flatten the list of matches into a character vector
+ result <- unlist(result)
+ # Combine the matched sequences into a single string
+ result <- paste(result, collapse = " ")
+ return(result)
+ } else {
+ return(df[[destination]][i])
+ }
+ })
+ return(df)
+ }
+
+ a_df <- extract_consecutive_numbers(a_df, "state", "postal_code")
+
+
+
+ # clean the city ----------------------------------------------------------
+
+ # remove any digits
+
+ a_df$city <- gsub("[0-9]", "", a_df$city)
+
+
+
+ # clean up postal code ----------------------------------------------------
+
+
+ a_df$postal_code <- ifelse(grepl("\\b[a-zA-Z]+\\s+[0-9]+\\b", a_df$postal_code),
+ gsub("\\b[a-zA-Z]+\\s", "", a_df$postal_code),
+ a_df$postal_code
+ )
+
+ # NETHERLANDS clean-up ----------------------------------------------------
+  # cities often have two characters at the start (ascii version of a ligature/diphthong)
+
+ a_df[] <- lapply(a_df, trimws)
+ a_df$city <- ifelse(a_df$country == "netherlands" & grepl("^[a-zA-Z]{2} ", a_df$city),
+ (sub("^[a-zA-Z]{2} ", "", a_df$city)), a_df$city
+ )
+
+ a_df[] <- lapply(a_df, trimws)
+
+
+
+ # Final clean-up of some US cities and states -----------------------------
+
+ a_df$city <- ifelse(a_df$city == "university pk",
+ "university park",
+ a_df$city
+ )
+
+ a_df$city <- ifelse(a_df$city == "college stn",
+ "college station",
+ a_df$city
+ )
+
+ a_df$city <- ifelse(a_df$city == "n chicago",
+ "north chicago",
+ a_df$city
+ )
+
+ a_df$city <- ifelse(a_df$city == "college pk",
+ "college park",
+ a_df$city
+ )
+
+ a_df$city <- ifelse(a_df$city == "research triangle pk" | a_df$city == "res triangle pk",
+ "research triangle park",
+ a_df$city
+ )
+
+ a_df$city <- ifelse(a_df$city == "state coll",
+ "state college",
+ a_df$city
+ )
+
+
+
+ a_df$city <- ifelse(grepl("sioux ctr", a_df$city),
+ (sub("sioux ctr", "sioux city", a_df$city)),
+ a_df$city
+ )
+
+
+
+ # Final clean-up of some Brazil cities and states -------------------------
+
+ message("\n(almost done...)\n")
+
+ a_df$city <- ifelse(a_df$country=="brazil" & grepl("seropedica", a_df$city),
+ "seropedica",
+ a_df$city
+ )
+
+ a_df$city <- ifelse(a_df$country == "brazil" & a_df$city == "gavea rio de janeiro",
+ "rio de janeiro",
+ a_df$city
+ )
+
+
+ a_df$city <- ifelse((a_df$country == "brazil" & a_df$city == "s jose campos"),
+ "sao jose dos campos",
+ a_df$city
+ )
+
+ a_df$city <- ifelse((a_df$country == "brazil" & (a_df$city == "rio de janerio" |
+ a_df$city == "rio de janiero" |
+ a_df$city == "rio der janeiro" |
+ a_df$city == "rio janeiro" |
+ a_df$city == "rio janiero")),
+ "rio de janeiro",
+ a_df$city
+ )
+
+ # Final clean-up of some INDIA cities and states --------------------------
+
+
+
+ a_df$city <- ifelse((a_df$city == "dehra dun" & a_df$country == "india"),
+ "dehradun",
+ a_df$city
+ )
+
+
+ # Final clean-up of some CANADA cities and states -------------------------
+
+
+ a_df$city <- ifelse((a_df$city == "st john" & a_df$country == "canada"),
+ "st. john's",
+ a_df$city
+ )
+
+
+ # Final clean-up of some UK cities and states -----------------------------
+
+
+ a_df$state <- ifelse(a_df$state == "london ",
+ "london",
+ a_df$state
+ )
+
+ a_df$city <- ifelse((a_df$state == "london" & a_df$country == "england"),
+ "london",
+ a_df$city
+ )
+
+
+ # final clean-up of some MEXICO cities and states -------------------------
+
+
+ a_df$city <- ifelse(a_df$country == "mexico" & a_df$city == "df",
+ "mexico city",
+ a_df$city
+ )
+
+
+ # final clean-up of some ARGENTINA cities and states ----------------------
+
+
+
+ a_df$city <- ifelse(a_df$country == "argentina" & a_df$city == "df",
+ "buenos aires", a_df$city
+ )
+
+
+ # final clean up of some ABBREVIATIONS in city names ----------------------
+
+
+ a_df$city <- ifelse(grepl("^st ", a_df$city),
+ (sub("^st ", "saint ", a_df$city)), a_df$city
+ )
+
+ a_df$city <- ifelse(grepl(" st ", a_df$city),
+ (sub(" st ", " saint ", a_df$city)), a_df$city
+ )
+
+ a_df$city <- ifelse(grepl("^ste ", a_df$city),
+ (sub("^ste ", "saint ", a_df$city)), a_df$city
+ )
+
+
+
+ # removing departments etc allocated to city or state ---------------------
+
+  # use strings of words typical of institutions or departments to remove
+
+ tech_words <- c(
+ " lab ", "lab ", " lab", "dept", "hosp", " inst", "inst ", "ctr",
+ "unit", "ltd", "minist", "educ", "grad ", " sch ", "sch ", " sch",
+ "coll ", " sci ", "natl", "&", " med", "med ",
+ "publ", "dept", "biomed", "phys", "technol",
+ "engn"
+ )
+ pattern <- paste(tech_words, collapse = "|")
+
+ a_df$city <- ifelse((a_df$city != "esch sur alzette" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)),
+ a_df$state, a_df$city
+ )
+
+
+  a_df$state <- ifelse(!is.na(a_df$city2) & a_df$state == a_df$city2,
+    NA, a_df$state
+  )
+
+ a_df$state <- ifelse(grepl("[[:digit:]]", a_df$state),
+ NA, a_df$state
+ )
+
+ a_df$state <- ifelse(a_df$state == "", NA, a_df$state)
+
+ a_df$postal_code <- ifelse(a_df$postal_code == "", NA, a_df$postal_code)
+
+
+
+ # still some us states not extracting properly but fixed here -------------
+
+ message("\n(so close...the end is in sight!)\n")
+
+ us_state_abbreviations_lower <- c(
+ "al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga",
+ "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md",
+ "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj",
+ "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc",
+ "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy"
+ )
+ pattern <- paste(us_state_abbreviations_lower, collapse = "|")
+ a_df$country_list <- country_list
+ a_df$state <- ifelse((a_df$country == "usa" &
+ is.na(a_df$state) &
+ grepl(pattern, a_df$address, ignore.case = TRUE, perl = TRUE)),
+ a_df$country_list,
+ a_df$state
+ )
+
+
+ a_df$state <- ifelse((a_df$country == "usa" & grepl("[[:digit:]]", a_df$state)),
+ gsub("[[:digit:]]", "", a_df$state),
+ a_df$state
+ )
+ a_df$state <- ifelse((a_df$country == "usa" & grepl("usa", a_df$state)),
+ gsub("usa", "", a_df$state),
+ a_df$state
+ )
+ a_df$state <- trimws(a_df$state, which = "both")
+
+
+
+
+  # Remove the panama canal zone from usa states (relevant for STRI addresses)
+ a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "cz"),
+ NA,
+ a_df$state
+ )
+
+  # armed forces ("aa"/"apo") & diplomatic ("dpo") addresses
+ a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "aa"),
+ NA,
+ a_df$state
+ )
+
+ a_df$city <- ifelse((a_df$country == "usa" & a_df$state == "apo"),
+ NA,
+ a_df$city
+ )
+
+ a_df$city <- ifelse((a_df$country == "usa" & a_df$state == "dpo"),
+ NA,
+ a_df$city
+ )
+
+
+ # Japanese prefectures & cities sometimes swapped in address --------------
+
+
+
+ to_delete <- c(
+ "&", "inst", "ctr", "med", "chem", "lab", "biol",
+ "dept", "div", "univ", "hosp", "coll", "sci", "rd",
+ "program", "minist", "educ", "sch ", "grad ", "fac ",
+ "assoc", "forest", "corp"
+ )
+ pattern <- paste(to_delete, collapse = "|")
+ a_df$city2 <- ifelse((a_df$country == "japan" &
+ grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)),
+ NA,
+ a_df$city2
+ )
+
+ # Remove any with numbers
+ a_df$city2 <- ifelse((a_df$country == "japan" &
+ grepl("[[:digit:]]", a_df$city2)),
+ NA,
+ a_df$city2
+ )
+
+ japan_prefectures <- c(
+ "hokkaido", "aomori", "iwate", "miyagi", "akita",
+ "yamagata", "fukushima", "ibaraki", "tochigi", "gunma",
+ "saitama", "chiba", "tokyo", "kanagawa", "niigata",
+ "toyama", "ishikawa", "fukui", "yamanashi", "nagano", "gifu",
+ "shizuoka", "aichi", "mie", "shiga", "kyoto", "osaka", "gumma",
+ "hyogo", "nara", "wakayama", "tottori", "shimane",
+ "okayama", "hiroshima", "yamaguchi", "tokushima", "kagawa",
+ "ehime", "kochi", "fukuoka", "saga", "nagasaki", "kumamoto",
+ "oita", "miyazaki", "kagoshima", "okinawa"
+ )
+ pattern <- paste(japan_prefectures, collapse = "|")
+
+
+ a_df$state <- ifelse((a_df$country == "japan" &
+ grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)),
+ a_df$city,
+ a_df$state
+ )
+
+
+ # This removes all special regions of a city like tokyo from city2
+ a_df$city2 <- ifelse((a_df$country == "japan" &
+ grepl(" ku", a_df$city2,
+ ignore.case = TRUE, perl = TRUE
+ )),
+ NA,
+ a_df$city2
+ )
+
+  # replace city with city2 EXCEPT when there is no state (city is then
+  # already correct) or no city2 (replacing would introduce an NA)
+
+ a_df$city <- ifelse((a_df$country == "japan" &
+ !(is.na(a_df$state)) & !(is.na(a_df$city2))),
+ a_df$city2,
+ a_df$city
+ )
+
+
+
+ # fine-tuning SCOTLAND ----------------------------------------------------
+
+
+
+ a_df$city <- ifelse((a_df$country == "scotland" &
+ grepl("univ ", a_df$city, ignore.case = TRUE, perl = TRUE)),
+ gsub("univ ", "", a_df$city),
+ a_df$city
+ )
+
+ to_delete <- c(
+ " ave", " grp", "hlth", " rd", "mrc", " oba", "plz",
+ " dr", "oqb", " quad", "fisheries"
+ )
+
+ pattern <- paste(to_delete, collapse = "|")
+ a_df$city <- ifelse((a_df$country == "scotland" &
+ grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)),
+ NA,
+ a_df$city
+ )
+
+
+
+ # fine-tuning ENGLAND -----------------------------------------------------
+
+ message("\n(this is it - the last step!)\n")
+
+
+ to_delete <- c(
+ "&", "inst", "ctr", "med", "chem", "lab", "biol",
+ "dept", "div", "univ", "hosp", "coll", "sci", "rd",
+ "program", "minist", "educ", "sch ", "grad ", "fac ",
+ " sq", "quarter", " way", " dr", "diagnost", "consultant",
+ "microsoft", "diagnost", "[[:digit:]]", "project", "facil", "grp",
+ "campus", "expt", " pk", "canc", "assoc", "forest", "corp",
+ "consortium", "partners", "lane", "ucl", "street", "trust",
+ "business", "inform", "royal", "survey", "drosophila", " st",
+ "ndorms", "nat hist", "hlth", " ave", "council", "unit", "nerc", "nat res"
+ )
+ pattern <- paste(to_delete, collapse = "|")
+
+ a_df$city2 <- ifelse((a_df$country == "england" &
+ grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)),
+ NA,
+ a_df$city2
+ )
+
+ a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)),
+ a_df$city2,
+ a_df$city
+ )
+
+
+ a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) &
+ grepl("london", a_df$address, ignore.case = TRUE, perl = TRUE),
+ "london",
+ a_df$city
+ )
+
+ a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) &
+ grepl("cambridge", a_df$address, ignore.case = TRUE, perl = TRUE),
+ "cambridge", a_df$city
+ )
+
+ a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) &
+ grepl("oxford", a_df$address, ignore.case = TRUE, perl = TRUE),
+ "oxford",
+ a_df$city
+ )
+
+ a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) &
+ grepl("durham", a_df$address, ignore.case = TRUE, perl = TRUE),
+ "durham", a_df$city
+ )
+
+ a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) &
+ grepl("bristol", a_df$address, ignore.case = TRUE, perl = TRUE),
+ "bristol",
+ a_df$city
+ )
+
+ a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) &
+ grepl("lancaster", a_df$address, ignore.case = TRUE, perl = TRUE),
+ "lancaster",
+ a_df$city
+ )
+
+
+
+
+# final clean up to return ------------------------------------------------
+
+
+
+  # delete the columns used for double-checking
+
+ a_df$city2 <- NULL
+ a_df$country_list <- NULL
+
+ # replace blank with NA
+
+ a_df[a_df == ""] <- NA
+
+
+ # return output of function -----------------------------------------------
+
+
return(a_df)
}
diff --git a/R/authors_clean.R b/R/authors_clean.R
index e68b9df..655036a 100644
--- a/R/authors_clean.R
+++ b/R/authors_clean.R
@@ -91,16 +91,6 @@ with the contents of column CA.', sep=" ")
final <- final[, c(cols,
colnames(final)[!colnames(final) %in% cols])]
- # sub_authors <- final %>%
- # filter(
- # groupID %in% final$groupID[!is.na(similarity) | flagged == 1]
- # ) %>%
- # select(authorID, AU, AF, groupID, match_name, matchID,
- # similarity, confidence, university, department,
- # postal_code, country, address, RP_address, RI,
- # OI, EM, UT, author_order, refID, PT, PY, PU) %>%
- # arrange(groupID, similarity, authorID)
- #
sub_authors <- subset(final,
groupID %in% groupID[!is.na(similarity) | flagged == 1],
select = c(
diff --git a/R/authors_georef.R b/R/authors_georef.R
index 574a70e..7156523 100644
--- a/R/authors_georef.R
+++ b/R/authors_georef.R
@@ -1,201 +1,385 @@
#' Extracts the lat and long for each address from authors_clean
#'
-#' \code{authors_georef} This function takes the final author list from
-#' refine_authors, and calculates the lat long of the addresses.
-#' It does this by feeding the addresses into data science toolkit.
-#' In order to maximize effectiveness and mitigate errors in parsing addresses
-#' We run this multiple times creating addresses in different ways
-#' in hopes that the google georeferencing API can recognize an address
-#' 1st. University, city, zipcode, country
-#' 2nd. City, zipcode, country
-#' 3rd. city, country
-#' 4th. University, country
-#'
-#' The output is a list with three data.frames
-#' \code{addresses} is a data frame with all information from
-#' refine_authors plus new location columns and calculated lat longs.
-#' \code{missing addresses} is a data frame with all addresses could
-#' not be geocoded
-#' \code{addresses} is a data frame like \code{addresses} except
-#' the missing addresses are gone.
+#' \code{authors_georef} This function takes the final author list from
+#' refine_authors and calculates the latitude and longitude of the city,
+#' country, and postal code (for USA addresses) or city and country (for
+#' addresses outside the USA).
+#'
+#' The output is a list of three data.frames:
+#' \code{addresses} All info from 'refine_authors' plus new columns with
+#' lat & long. It includes ALL addresses, including those that could not
+#' be geocoded.
+#' \code{missing_addresses} A data frame of the addresses that could
+#' NOT be geocoded.
+#' \code{no_missing_addresses} the \code{addresses} data frame with ONLY the
+#' addresses that were geocoded.
#'
#' @param data dataframe from `authors_refine()`
#' @param address_column name of column in quotes where the addresses are
+#' @param google_api if `google_api = FALSE`, georeferencing is carried out
+#' with the `tidygeocoder` package (via `geocode()` with `method = 'osm'`).
+#' If `google_api = TRUE`, geocoding is done with the Google Maps API.
+#' Defaults to `FALSE`.
#' @importFrom ggmap geocode
-#'
-#' @examples
-#'
+#'
+#' @examples
#' \dontrun{
-#' BITR_georef_df <- authors_georef(BITR_refined, address_column='address')
+#' BITR_georef_df <- authors_georef(BITR_refined, address_column = "address",
+#'   google_api = FALSE)
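+#'
+#' # inspect the three components of the returned list (see the description
+#' # above for their contents)
+#' head(BITR_georef_df$addresses)
+#' head(BITR_georef_df$missing_addresses)
+#' head(BITR_georef_df$no_missing_addresses)
+#'
+#' # or geocode with the Google Maps API instead (may incur charges)
+#' BITR_georef_df <- authors_georef(BITR_refined, address_column = "address",
+#'   google_api = TRUE)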
#' }
-#' @export authors_georef
-#'
-authors_georef <- function(data,
- address_column = "address") {
-
- options(ggmap = list(display_api_key = FALSE))
- if (!is.character(data$address)) {
- stop("Address columns are not characters,
+#' @export authors_georef
+#'
+authors_georef <- function(
+ data,
+ address_column = "address",
+ google_api = FALSE) {
+ if (google_api == TRUE) {
+ pt1 <- ("Attention: You have chosen to geocode with the GOOGLE API.\n")
+ pt2 <- ("The number of free API calls in one month is limited.\n")
+ pt3 <- ("If the number of addresses being georeferenced exceeds \n")
+ pt4 <- ("this limit, Google WILL bill you for the difference.\n")
+ pt5 <- ("Please refer to Google's current billing rates & usage limits.\n")
+
+ message(paste(pt1, pt2, pt3, pt4, pt5, sep = ""))
+ rm(pt1, pt2, pt3, pt4, pt5)
+
+
+ options(ggmap = list(display_api_key = FALSE))
+
+ if (!is.character(data$address)) {
+ stop("Address columns are not characters,
please change to characters and try again")
- }
- addresses <- data[, c("university", "city", "state", "country",
- "postal_code", "authorID", "address")]
- #Change some formatting to help data science toolkit
- addresses$university[is.na(addresses$university)] <- ""
- addresses$country[is.na(addresses$country)] <- ""
- addresses$postal_code[is.na(addresses$postal_code)] <- ""
- addresses$city[is.na(addresses$city)] <- ""
- addresses$state[is.na(addresses$state)] <- ""
- addresses$country <- trimws(addresses$country, which = "both")
- addresses$city <- trimws(addresses$city, which = "both")
- addresses$state <- trimws(addresses$state, which = "both")
- addresses$university <- trimws(addresses$university, which = "both")
-
- # create Short form address base to defaul address
- # rougly adheres to universty, city, zipcode, country
- addresses$base <- addresses$country
- addresses$base[addresses$postal_code != ""] <-
- paste0(addresses$base[addresses$postal_code != ""],
- ", ",
- addresses$postal_code[addresses$postal_code != ""])
-
- addresses$base[addresses$state != ""] <-
- paste0(addresses$state[addresses$state != ""],
- ", ",
- addresses$country[addresses$state != ""])
-
- # second tier, city > zip > university
- addresses$second <- NA
- addresses$second[addresses$city != ""] <- addresses$city[addresses$city != ""]
- addresses$second[is.na(addresses$second) & addresses$university != ""] <-
- addresses$university[is.na(addresses$second) & addresses$university != ""]
-
- addresses$short_address <- addresses$base
- addresses$short_address[!is.na(addresses$second)] <-
- paste0(addresses$second[!is.na(addresses$second)],
- ", ",
- addresses$short_address[!is.na(addresses$second)])
- addresses$lat <- NA
- addresses$lon <- NA
- addresses$adID <- seq_len(nrow(addresses))
-
- # # we'll check if data science toolkit is working, by pinging a known address
- check_ad <- "1600 Pennsylvania Ave NW, Washington, DC 20500"
- check.open <- sum(is.na(ggmap::geocode(check_ad, source = "google", urlonly = TRUE))) == 0
- if (!check.open) {
- stop("google geocoding API is down right now, please try again later")
- }
+ }
+ addresses <- data[, c(
+ "university", "city", "state", "country",
+ "postal_code", "authorID", "address"
+ )]
+  # Change some formatting to help the geocoder
+ addresses$university[is.na(addresses$university)] <- ""
+ addresses$country[is.na(addresses$country)] <- ""
+ addresses$postal_code[is.na(addresses$postal_code)] <- ""
+ addresses$city[is.na(addresses$city)] <- ""
+ addresses$state[is.na(addresses$state)] <- ""
+ addresses$country <- trimws(addresses$country, which = "both")
+ addresses$city <- trimws(addresses$city, which = "both")
+ addresses$state <- trimws(addresses$state, which = "both")
+ addresses$university <- trimws(addresses$university, which = "both")
- #Lets try broad strokes first. Our 4 layered address
-
- ggmap::register_google(
- key = ggmap::google_key(),
- write = TRUE,
- second_limit = 50,
- day_limit = 2500
- )
-
- for (i in addresses$adID[addresses$short_address != ""]) {
-
- address <- as.character(addresses$short_address[i])
- #if (address == '') next
- message(paste("Working... ", address))
-
- suppressWarnings(result <- ggmap::geocode(address,
- output = "latlona",
- source = "google",
- messaging = TRUE
+  # create short-form address base as the default address;
+  # it roughly adheres to university, city, zipcode, country
+ addresses$base <- addresses$country
+ addresses$base[addresses$postal_code != ""] <-
+ paste0(
+ addresses$base[addresses$postal_code != ""],
+ ", ",
+ addresses$postal_code[addresses$postal_code != ""]
+ )
+
+ addresses$base[addresses$state != ""] <-
+ paste0(
+ addresses$state[addresses$state != ""],
+ ", ",
+ addresses$country[addresses$state != ""]
+ )
+
+ # second tier, city > zip > university
+ addresses$second <- NA
+ addresses$second[addresses$city != ""] <- addresses$city[addresses$city != ""]
+ addresses$second[is.na(addresses$second) & addresses$university != ""] <-
+ addresses$university[is.na(addresses$second) & addresses$university != ""]
+
+ addresses$short_address <- addresses$base
+ addresses$short_address[!is.na(addresses$second)] <-
+ paste0(
+ addresses$second[!is.na(addresses$second)],
+ ", ",
+ addresses$short_address[!is.na(addresses$second)]
+ )
+ addresses$lat <- NA
+ addresses$lon <- NA
+ addresses$adID <- seq_len(nrow(addresses))
+
+  # we'll check if the google geocoding api is working by pinging a known address
+ check_ad <- "1600 Pennsylvania Ave NW, Washington, DC 20500"
+ check.open <- sum(is.na(ggmap::geocode(check_ad, source = "google", urlonly = TRUE))) == 0
+ if (!check.open) {
+ stop("google geocoding API is down right now, please try again later")
+ }
+
+  # Let's try broad strokes first: our 4-layered address
+
+ ggmap::register_google(
+ key = ggmap::google_key(),
+ write = TRUE,
+ second_limit = 50,
+ day_limit = 2500
+ )
+
+ for (i in addresses$adID[addresses$short_address != ""]) {
+ address <- as.character(addresses$short_address[i])
+ # if (address == '') next
+ message(paste("Working... ", address))
+
+ suppressWarnings(result <- ggmap::geocode(address,
+ output = "latlona",
+ source = "google",
+ messaging = TRUE
))
- addresses$lat[addresses$adID == i] <- result[[2]]
- addresses$lon[addresses$adID == i] <- result[[1]]
- }
+ addresses$lat[addresses$adID == i] <- result[[2]]
+ addresses$lon[addresses$adID == i] <- result[[1]]
+ }
- # Now lets try using a shorter code (city, state, country)
- remain <- addresses[is.na(addresses$lat), ]
- remain$short_address <-
- ifelse(!(is.na(remain$state) | is.na(remain$country)),
- paste0(remain$city, ", ", remain$state, ", ", remain$country),
- NA)
- remain <- remain[!is.na(remain$short_address) &
+ # Second pass: a shorter address (city, state, country)
+ remain <- addresses[is.na(addresses$lat), ]
+ remain$short_address <-
+ ifelse(!(is.na(remain$state) | is.na(remain$country)),
+ paste0(remain$city, ", ", remain$state, ", ", remain$country),
+ NA
+ )
+ remain <- remain[!is.na(remain$short_address) &
remain$short_address != ", , ", ]
- for (i in remain$adID) {
- address <- as.character(remain$short_address[remain$adID == i])
- message(paste("Working... ", address))
- suppressWarnings(result <- ggmap::geocode(address,
- output = "latlona",
- source = "google",
- messaging = TRUE
- ))
- addresses$lat[addresses$adID == i] <- result[[2]]
- addresses$lon[addresses$adID == i] <- result[[1]]
- }
+ for (i in remain$adID) {
+ address <- as.character(remain$short_address[remain$adID == i])
+ message(paste("Working... ", address))
+ suppressWarnings(result <- ggmap::geocode(address,
+ output = "latlona",
+ source = "google",
+ messaging = TRUE
+ ))
+ addresses$lat[addresses$adID == i] <- result[[2]]
+ addresses$lon[addresses$adID == i] <- result[[1]]
+ }
- # Now try city, country
- remain <- addresses[is.na(addresses$lat), ]
- remain$short_address <-
- ifelse(!(is.na(remain$city) | is.na(remain$country)),
- paste0(remain$city, ", ", remain$country),
- NA)
+ # Third pass: city, country
+ remain <- addresses[is.na(addresses$lat), ]
+ remain$short_address <-
+ ifelse(!(is.na(remain$city) | is.na(remain$country)),
+ paste0(remain$city, ", ", remain$country),
+ NA
+ )
- remain <- remain[!is.na(remain$short_address) &
+ remain <- remain[!is.na(remain$short_address) &
remain$short_address != ", ", ]
- for (i in remain$adID) {
- address <- as.character(remain$short_address[remain$adID == i])
- message(paste("Working... ", address))
- suppressWarnings(result <- ggmap::geocode(address,
- output = "latlona",
- source = "google",
- messaging = TRUE
- ))
- addresses$lat[addresses$adID == i] <- result[[2]]
- addresses$lon[addresses$adID == i] <- result[[1]]
- }
+ for (i in remain$adID) {
+ address <- as.character(remain$short_address[remain$adID == i])
+ message(paste("Working... ", address))
+ suppressWarnings(result <- ggmap::geocode(address,
+ output = "latlona",
+ source = "google",
+ messaging = TRUE
+ ))
+ addresses$lat[addresses$adID == i] <- result[[2]]
+ addresses$lon[addresses$adID == i] <- result[[1]]
+ }
- # Finally try using just university, country
- remain <- addresses[is.na(addresses$lat), ]
- remain$short_address <-
- ifelse(!(is.na(remain$university) | is.na(remain$country)),
- paste0(remain$university, ", ", remain$country),
- NA)
+ # Fourth pass: university, country
+ remain <- addresses[is.na(addresses$lat), ]
+ remain$short_address <-
+ ifelse(!(is.na(remain$university) | is.na(remain$country)),
+ paste0(remain$university, ", ", remain$country),
+ NA
+ )
- remain <- remain[!is.na(remain$short_address) &
+ remain <- remain[!is.na(remain$short_address) &
remain$short_address != ", ", ]
- for (i in remain$adID) {
- address <- as.character(remain$short_address[remain$adID == i])
- message(paste("Working... ", address))
- suppressWarnings(result <- ggmap::geocode(address,
- output = "latlona",
- source = "google",
- messaging = TRUE
+ for (i in remain$adID) {
+ address <- as.character(remain$short_address[remain$adID == i])
+ message(paste("Working... ", address))
+ suppressWarnings(result <- ggmap::geocode(address,
+ output = "latlona",
+ source = "google",
+ messaging = TRUE
+ ))
+ addresses$lat[addresses$adID == i] <- result[[2]]
+ addresses$lon[addresses$adID == i] <- result[[1]]
+ }
+
+ ## Change "" back to NA
+ addresses$country[addresses$country == ""] <- NA
+ addresses$university[addresses$university == ""] <- NA
+ addresses$postal_code[addresses$postal_code == ""] <- NA
+
+ addresses <-
+ merge(
+ addresses[, c(
+ "authorID", "lat", "lon"
+ )],
+ data[, c(
+ "authorID", "groupID", "author_order", "address",
+ "department", "RP_address", "RI", "OI", "UT", "refID"
+ )],
+ by = "authorID", all.y = TRUE
+ )
+
+ missingaddresses <- addresses[is.na(addresses$lat), ]
+ addresses$lat <- unlist(addresses$lat)
+ addresses$lon <- unlist(addresses$lon)
+
+ outputlist <- list()
+ outputlist$addresses <- addresses
+ outputlist$missing_addresses <- missingaddresses
+ outputlist$no_missing_addresses <- addresses[!is.na(addresses$lat), ]
+
+ # Reset the ggmap display_api_key option to TRUE (a workaround until ggmap is fixed)
+ on.exit(options(ggmap = list(display_api_key = TRUE)))
+ return(outputlist)
+ }
+
+ if (google_api != TRUE) {
+
+ if (!requireNamespace("tidygeocoder", quietly = TRUE)) {
+   stop("The 'tidygeocoder' package is required for OSM geocoding")
+ }
+
+ pt1 <- ("You are Geocoding with OpenStreetMap.\n")
+ pt2 <- ("This proceeds at a rate of 1 address/second.\n")
+ pt3 <- ("For large data sets: OSM requests that you consider downloading\n")
+ pt4 <- ("the complete database to query locally instead of using the API.\n")
+ pt5 <- ("See the Refsplitr vignette for more information.\n")
+ message(paste(pt1, pt2, pt3, pt4, pt5, sep = ""))
+ rm(pt1, pt2, pt3, pt4, pt5)
+
+
+
+ if (!is.character(data$address)) {
+   stop("The address column is not of type character, please convert it to character and try again")
+ }
+
+ a_df <- data[, c(
+ "city", "state", "country",
+ "postal_code", "authorID"
+ )]
+ a_df$country[a_df$country == "could not be extracted"] <- NA
+ a_df$state[a_df$state == "no state"] <- NA
+ a_df <- a_df[!is.na(a_df$country), ]
+ a_df$addr <- NA
+
+
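+ # Build the geocoding query string: US records use "city,state,postal_code,country"
+ # (dropping whichever pieces are missing); all other countries use "city,country"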
+ a_df$addr <- ifelse(a_df$country == "usa",
+ ifelse(!is.na(a_df$state),
+ ifelse(!is.na(a_df$postal_code),
+ paste(a_df$city,
+ a_df$state,
+ a_df$postal_code,
+ a_df$country,
+ sep = ","
+ ),
+ paste(a_df$city,
+ a_df$state,
+ a_df$country,
+ sep = ","
+ )
+ ),
+ ifelse(!is.na(a_df$postal_code),
+ paste(a_df$city,
+ a_df$postal_code,
+ a_df$country,
+ sep = ","
+ ),
+ paste(a_df$city,
+ a_df$country,
+ sep = ","
+ )
+ )
+ ),
+ paste(a_df$city,
+ a_df$country,
+ sep = ","
+ )
+ )
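+ # e.g. "gainesville,fl,32611,usa" or "sao paulo,brazil" (hypothetical values)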
+
+
+ a_df$addr <- ifelse(a_df$country == "Could not be extracted",
+ NA,
+ a_df$addr
+ )
+ # Geocode each unique address string only once
+ to_georef_df <- data.frame(addr = unique(a_df$addr))
+
+ message(paste("Number of locations being geocoded: ",
+ nrow(to_georef_df), sep = ""))
+
+
+
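+ # Batch-geocode the unique addresses against Nominatim (OSM);
+ # tidygeocoder throttles requests to the service's 1-per-second limit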
+ to_georef_df <- to_georef_df |> tidygeocoder::geocode(addr,
+ method = "osm",
+ lat = latitude, long = longitude
+ )
+ no_latlon <- to_georef_df[is.na(to_georef_df$latitude), ]
+ perc_missing <- (nrow(no_latlon) / nrow(to_georef_df)) * 100
+
+ pt1 <- c(paste("Unable to georef ",
+ round(perc_missing, 2), "% of author addresses.\n",
+ sep = ""
))
- addresses$lat[addresses$adID == i] <- result[[2]]
- addresses$lon[addresses$adID == i] <- result[[1]]
- }
+ pt2 <- c("Check `outputlist$missing_addresses` to see which ones.\n")
+ message(paste(pt1, pt2, sep = ""))
+ rm(pt1, pt2, perc_missing)
- ## Change "" back to NA
- addresses$country[addresses$country == ""] <- NA
- addresses$university[addresses$university == ""] <- NA
- addresses$postal_code[addresses$postal_code == ""] <- NA
-
- addresses <-
- merge(
- addresses[, c("authorID", "university", "postal_code",
- "country", "lat", "lon")],
- data[, c("authorID", "groupID", "author_order", "address",
- "department", "RP_address", "RI", "OI", "UT", "refID")],
- by = "authorID", all.y = TRUE)
-
- missingaddresses <- addresses[is.na(addresses$lat), ]
- addresses$lat <- unlist(addresses$lat)
- addresses$lon <- unlist(addresses$lon)
-
- outputlist <- list()
- outputlist$addresses <- addresses
- outputlist$missing_addresses <- missingaddresses
- outputlist$not_missing_addresses <- addresses[!is.na(addresses$lat), ]
-
- # reset ggmaps option to TRUE. This only until the ggmaps gets fixed
- on.exit(options(ggmap = list(display_api_key = TRUE)))
- return(outputlist)
-}
\ No newline at end of file
+ # Merge the geocoded coordinates back into the original author data
+ a_df <-
+ merge(
+ to_georef_df[, c(
+ "addr", "latitude", "longitude"
+ )],
+ a_df,
+ by = "addr", all.y = TRUE
+ )
+
+ data <-
+ merge(
+ a_df[, c(
+ "authorID", "latitude", "longitude"
+ )],
+ data,
+ by = c("authorID"), all.y = TRUE
+ )
+
+ names(data)[names(data) == "latitude"] <- "lat"
+ names(data)[names(data) == "longitude"] <- "lon"
+
+ addresses <- data
+ missingaddresses <- data[is.na(data$lat), ]
+ addresses$lat <- unlist(data$lat)
+ addresses$lon <- unlist(data$lon)
+
+ outputlist <- list()
+ outputlist$addresses <- addresses
+ outputlist$missing_addresses <- missingaddresses
+ outputlist$no_missing_addresses <- addresses[!is.na(addresses$lat), ]
+ pt1 <- ("The output is a list with three data.frames:\n")
+ pt2 <- ("outputlist$addresses: all info from 'refine_authors'
+ plus new `lat` & `long` columns. It includes ALL addresses,
+ including those that could not be geocoded. \n")
+ pt3 <- ("outputlist$missing_addresses: Includes only the addresses that
+ could NOT be geocoded.\n")
+ pt4 <- ("outputlist$no_missing_addresses: Includes only the addresses
+ that WERE geocoded. \n")
+ message(paste(pt1, pt2, pt3, pt4, sep = ""))
+ rm(pt1, pt2, pt3, pt4)
+
+ return(outputlist)
+ }
+}
diff --git a/R/authors_match.R b/R/authors_match.R
index 988a77a..63a7731 100644
--- a/R/authors_match.R
+++ b/R/authors_match.R
@@ -139,12 +139,7 @@ authors_match <- function(data){
if (any(matched_df$merged[matched_df$groupID == q])) next
sub <- matched_df[matched_df$groupID == q, ]
- # common_df <- matched_df %>%
- # dplyr::filter(
- # squash %in% sub$squash &
- # ( (f_c %in% 1) | (f_c > 1 & first %in% sub$first) ) &
- # groupID != q
- # )
+
common_df <- subset(matched_df,
squash %in% sub$squash &
( (f_c %in% 1) | (f_c > 1 & first %in% sub$first) ) &
diff --git a/R/plot_net_address.R b/R/plot_net_address.R
index 691bd9c..58b8405 100644
--- a/R/plot_net_address.R
+++ b/R/plot_net_address.R
@@ -193,7 +193,10 @@ plot_net_address <- function(data,
}
## Create the world outlines:
world_map@data$id <- rownames(world_map@data)
- world_map.points <- ggplot2::fortify(world_map)
+
+
+ # world_map.points <- ggplot2::fortify(world_map) # deprecated
+ world_map.points <- sf_convert(world_map)
world_map.df <- merge(world_map.points,
world_map@data, by = "id", all = TRUE)
world_map.df <- world_map.df[!is.na(world_map.df$lat), ]
diff --git a/R/plot_net_country.R b/R/plot_net_country.R
index d619631..85ddba7 100644
--- a/R/plot_net_country.R
+++ b/R/plot_net_country.R
@@ -45,60 +45,51 @@ plot_net_country <- function(data,
mapRegion = "world",
lineAlpha = 0.5) {
- requireNamespace(package = "dplyr", quietly = TRUE)
- requireNamespace(package = "magrittr", quietly = TRUE)
+
- fixable_countries<-data %>%
- dplyr::filter(is.na(country)==FALSE & is.na(lat)==TRUE) %>%
- dplyr::select(refID,country) %>%
- dplyr::group_by(refID,country) %>%
- dplyr::tally() %>%
+ fixable_countries <- data |>
+   dplyr::filter(!is.na(country) & is.na(lat)) |>
+   dplyr::select(refID, country) |>
+   dplyr::group_by(refID, country) |>
+   dplyr::tally() |>
dplyr::arrange(n)
data <- data[!is.na(data$country), ]
- # names in WOS often don't match those in rworldmap'
- data<-data %>%
- dplyr::mutate(country=dplyr::case_when(
- country == "usa" ~ "united states of america",
- country == "united states" ~ "united states of america",
- country == "serbia" ~ "republic of serbia",
- country == "peoples r china" ~ "china",
- country == "uk" ~ "united kingdom",
- country == "england" ~ "united kingdom",
- country == "scotland" ~ "united kingdom",
- country == "wales" ~ "united kingdom",
- country == "north ireland" ~ "united kingdom",
- country == "cent afr republ" ~ "central african republic",
- country == "cote ivoire" ~ "ivory coast",
- country == "papua n guinea" ~ "papua new guinea",
- country == "sao tome & prin" ~ "sao tome and principe",
- country == "tanzania" ~ "united republic of tanzania",
- country == "rep congo" ~ "republic of the congo",
- country == "bahamas" ~ "the bahamas",
- country == "dem rep congo" ~ "republic of the congo",
- country == "rep congo" ~ "democratic republic of the congo",
- country == "democratic republic of congo" ~ "democratic republic of the congo",
- country == "fr polynesia" ~ "french polynesia",
- country == "surinam" ~ "suriname",
- country == "turks & caicos" ~ "turks and caicos islands",
- country == "u arab emirates" ~ "united arab emirates",
- # country == "curaçao" ~ "curacao",
- country == "cura\u00e7ao" ~ "curacao", # to avoid fail for non-ascii characters
- country == "libyan arab jamahiriya" ~ "libya",
- country == "rhodesia" ~ "zimbabwe",
- country == "russian federation" ~ "russia",
- country == "hong kong" ~ "hong kong sar",
- country == "hong kong s.a.r." ~ "hong kong sar",
- country == "brunei darussalam" ~ "brunei",
- country == "trinidade and tobago" ~ "trinidad and tobago",
- .default = as.character(country)
- ))
-
- # are there any without lat/lon but WITH country?
- # data %>% filter(is.na(lat)==TRUE) %>% distinct(country)
+ data$country[data$country=="usa"] <- "united states of america"
+ data$country[data$country=="united states"] <- "united states of america"
+ data$country[data$country=="serbia"] <- "republic of serbia"
+ data$country[data$country=="peoples r china"] <- "china"
+ data$country[data$country=="uk"] <- "united kingdom"
+ data$country[data$country=="england"] <- "united kingdom"
+ data$country[data$country=="scotland"] <- "united kingdom"
+ data$country[data$country=="wales"] <- "united kingdom"
+ data$country[data$country=="north ireland"] <- "united kingdom"
+ data$country[data$country=="cent afr republ"] <- "central african republic"
+ data$country[data$country=="cote ivoire"] <- "ivory coast"
+ data$country[data$country=="papua n guinea"] <- "papua new guinea"
+ data$country[data$country=="sao tome & prin"] <- "sao tome and principe"
+ data$country[data$country=="tanzania"] <- "united republic of tanzania"
+ data$country[data$country=="rep congo"] <- "republic of the congo"
+ data$country[data$country=="bahamas"] <- "the bahamas"
+ data$country[data$country=="dem rep congo"] <- "republic of the congo"
+ data$country[data$country=="rep congo"] <- "democratic republic of the congo"
+ data$country[data$country=="democratic republic of congo"] <- "democratic republic of the congo"
+ data$country[data$country=="fr polynesia"] <- "french polynesia"
+ data$country[data$country=="surinam"] <- "suriname"
+ data$country[data$country=="turks & caicos"] <- "turks and caicos islands"
+ data$country[data$country=="u arab emirates"] <- "united arab emirates"
+ data$country[data$country=="libyan arab jamahiriya"] <- "libya"
+ data$country[data$country=="rhodesia"] <- "zimbabwe"
+ data$country[data$country=="russian federation"] <- "russia"
+ data$country[data$country=="hong kong"] <- "hong kong sar"
+ data$country[data$country=="hong kong s.a.r."] <- "hong kong sar"
+ data$country[data$country=="brunei darussalam"] <- "brunei"
+ data$country[data$country=="trinidade and tobago"] <- "trinidad and tobago"
+ # to avoid fail for non-ascii characters
+ data$country[data$country=="cura\u00e7ao"] <- "curacao"
## we could use a sparse matrix representation:
linkages <- Matrix::spMatrix(
@@ -106,6 +97,8 @@ plot_net_country <- function(data,
ncol = length(unique(data$UT)),
i = as.numeric(factor(data$country)),
j = as.numeric(factor(data$UT)),
x = rep(1, length(data$country))
)
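+
+ # linkages is a country-by-publication incidence matrix: rows index
+ # countries, columns index papers (UT), and each author record
+ # contributes a 1 at its (country, paper) entry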
@@ -177,21 +170,44 @@ plot_net_country <- function(data,
# coords_df %>% filter(is.na(LAT)==TRUE) %>% distinct(ISO_A2)
# need to add them manually
- coords_df<- coords_df %>%
- dplyr::mutate(LAT=dplyr::case_when(
- ISO_A2 == "french guiana" ~ 3.9339,
- ISO_A2 == "bonaire" ~ 12.2019,
- ISO_A2 == "reunion" ~ -68.2624,
- ISO_A2 == "palestine" ~ 31.9522,
- .default = as.numeric(LAT)
- )) %>%
- dplyr::mutate(LON=dplyr::case_when(
- ISO_A2 == "french guiana" ~ -53.1258,
- ISO_A2 == "bonaire" ~ -68.2624,
- ISO_A2 == "reunion" ~ 55.5364,
- ISO_A2 == "palestine" ~ 35.2332,
- .default = as.numeric(LON)
- ))
+ # LAT
+
+
+ coords_df$LAT <- ifelse(coords_df$ISO_A2 == "french guiana",
+ 3.9339,
+ coords_df$LAT)
+
+ coords_df$LAT <- ifelse(coords_df$ISO_A2 == "bonaire",
+ 12.2019,
+ coords_df$LAT)
+
+ coords_df$LAT <- ifelse(coords_df$ISO_A2 == "reunion",
+ -68.2624,
+ coords_df$LAT)
+
+ coords_df$LAT <- ifelse(coords_df$ISO_A2 == "palestine",
+ 31.9522,
+ coords_df$LAT)
+
+
+ # LON
+
+ coords_df$LON <- ifelse(coords_df$ISO_A2 == "french guiana",
+ -53.1258,
+ coords_df$LON)
+
+ coords_df$LON <- ifelse(coords_df$ISO_A2 == "bonaire",
+ -68.2624,
+ coords_df$LON)
+
+ coords_df$LON <- ifelse(coords_df$ISO_A2 == "reunion",
+ 55.5364,
+ coords_df$LON)
+
+ coords_df$LON <- ifelse(coords_df$ISO_A2 == "palestine",
+ 35.2332,
+ coords_df$LON)
+
## One could also use ggplot to plot out the network geographically:
@@ -209,7 +225,7 @@ plot_net_country <- function(data,
layoutCoordinates <- stats::na.omit(layoutCoordinates)
- adjacencyList<- adjacencyList %>%
+ adjacencyList<- adjacencyList |>
dplyr::mutate(country=dplyr::case_when(
country == "V1" ~ NA,
.default = as.character(country)
@@ -217,7 +233,7 @@ plot_net_country <- function(data,
- adjacencyList<- adjacencyList %>%
+ adjacencyList<- adjacencyList |>
dplyr::mutate(countryA=dplyr::case_when(
countryA == "V1" ~ NA,
.default = as.character(countryA)
@@ -397,7 +413,7 @@ plot_net_country <- function(data,
ggplot2::geom_path(
data = allEdges,
ggplot2::aes(x = !!x, y = !!y, group = !!Group,
- colour = !!Sequence, size = !!Sequence), alpha = lineAlpha
+ colour = !!Sequence, linewidth = !!Sequence), alpha = lineAlpha
) +
ggplot2::geom_point(
data = data.frame(layoutCoordinates), # Add nodes
diff --git a/R/references_read.R b/R/references_read.R
index 9154a8b..44f8412 100644
--- a/R/references_read.R
+++ b/R/references_read.R
@@ -13,7 +13,7 @@
#' @param include_all if FALSE only a subset of commonly used fields from references records are imported.
#' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, FX, GA, J9,
-#' LA, PA, PI, PN, PS, RID, SU, VR.
+#' LA, PA, PI, PN, PS, RID, SU, VR, OA.
#' @export references_read
#'
#' @examples
diff --git a/README.Rmd b/README.Rmd
index 02a9c4b..ed79b38 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -35,7 +35,7 @@ There are four steps in the `refsplitr` package's workflow:
1. Importing and tidying Web of Science reference records (be sure to download records using the procedure in Appendix 1 of the [vignette](https://docs.ropensci.org/refsplitr/articles/refsplitr.html))
2. Author name disambiguation and parsing of author addresses
-3. Georeferencing of author institutions. (*Important Note*: Google has changed its API requirements, which means users now have register with Google prior to georeferencing. For additional details see the [`ggmap`](https://github.com/dkahle/ggmap) repository and the instructions below.)
+3. Georeferencing of author institutions using either the [Nominatim](https://nominatim.org/) service, which uses OpenStreetMap data and which `refsplitr` queries via the [`tidygeocoder`](https://jessecambon.github.io/tidygeocoder/) package (the free default), _OR_ the Google Maps API (a limited number of free queries, after which users must pay); for additional details on pricing and on how to register with Google to use their API, see the `refsplitr` [vignette](https://docs.ropensci.org/refsplitr/articles/refsplitr.html).
4. Data visualization
The procedures required for these four steps,each of which is implemented with a simple command, are described in detail in the `refsplitr` [vignette](https://docs.ropensci.org/refsplitr/articles/refsplitr.html). An example of this workflow is provided below:
@@ -58,29 +58,6 @@ dat4 <- authors_georef(dat3)
plot_net_address(dat4$addresses)
```
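+
+To choose between the two geocoding services, set the `google_api` argument of `authors_georef()` (a minimal sketch; `dat3` is the output of `authors_refine()` from the example above):
+
+```{r example_georef, eval=FALSE}
+# Default: free OSM/Nominatim geocoding via tidygeocoder
+dat4 <- authors_georef(dat3, google_api = FALSE)
+
+# Alternative: the Google Maps API (requires a registered key;
+# free-query limits apply)
+# dat4 <- authors_georef(dat3, google_api = TRUE)
+```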
-## Registering with Google for an API key
-
-1. Install and load the `ggmap` package
-
-```{r example2, eval=FALSE}
-
-install.packages("ggmap")
-library(ggmap)
-
-```
-
-1. Register for a Google [Geocoding API](https://developers.google.com/maps/documentation/geocoding/overview) by following the instructions on the `READ ME` of the [`ggmap`](https://github.com/dkahle/ggmap) repository.
-
-2. Once you have your API key, add it to your `~/.Renviron` with the following:
-
-```{r example3, eval=FALSE}
-`ggmap::register_google(key = "[your key]", write = TRUE)`
-```
-
-3. You should now be able to use `authors_georef()` as described in the vignette. **WARNING:** `refsplitr` currently has a limit of 2500 API calls per day. We are working on including the ability for users to select their own limits.
-
-***Remember***: Your API key is unique and for you alone. Don't share it with other users or record it in a script file that is saved in a public repository. If need be you can visit the same website where you initially registered and generate a new key.
-
## Improvements & Suggestions
We welcome any suggestions for package improvement or ideas for features to include in future versions. If you have suggestions, [here is how to contribute](https://github.com/ropensci/refsplitr/blob/master/CONTRIBUTING.md). We expect everyone contributing to the package to abide by our [Code of Conduct](https://github.com/ropensci/refsplitr/blob/master/CODE_OF_CONDUCT.md).
diff --git a/man/BITR.Rd b/man/BITR.Rd
index 275ed53..ae1e13f 100644
--- a/man/BITR.Rd
+++ b/man/BITR.Rd
@@ -3,7 +3,7 @@
\docType{data}
\name{BITR}
\alias{BITR}
-\title{Data from the journal BioTropica (pulled from Web of Knowledge)}
+\title{Data from the journal Biotropica (pulled from Web of Knowledge)}
\format{
A data frame with 10 rows and 32 variables:
\describe{
@@ -47,7 +47,7 @@ The remaining codes are described on the Web of Knowledge website:
BITR
}
\description{
-A dataset containing 10 articles taken from the BioTropica journal.
+A dataset containing 10 articles taken from the journal Biotropica.
This dataset represents the typical formatted output from \code{references_read()}
in the refsplitr package. It serves as a testbed for commonly miscategorized names
}
diff --git a/man/BITR_geocode.Rd b/man/BITR_geocode.Rd
index ef960ba..1351a7e 100644
--- a/man/BITR_geocode.Rd
+++ b/man/BITR_geocode.Rd
@@ -3,7 +3,7 @@
\docType{data}
\name{BITR_geocode}
\alias{BITR_geocode}
-\title{Georeferenced data from the journal BioTropica (pulled from Web of Science)}
+\title{Georeferenced data from the journal Biotropica (pulled from Web of Science)}
\format{
A data frame with 41 rows and 15 variables:
\describe{
@@ -15,8 +15,8 @@ institution for non-universities}
\item{lat}{numeric, latitude populated from authors_georef}
\item{lon}{numeric, longitude populated from authors_georef}
\item{groupID}{ID field for what name group the author
-is identied as from authors_clean()}
-\item{author_order}{numeric, order of author from jounral article}
+is identified as from authors_clean()}
+\item{author_order}{numeric, order of author from journal article}
\item{address}{address of references pulled from
the original raw WOS file}
\item{department}{department which is nested within university}
@@ -34,7 +34,7 @@ given by references_read()}
BITR_geocode
}
\description{
-A dataset containing 41 authors taken from the BioTropica journal.
+A dataset containing 41 authors taken from the Biotropica journal.
This dataset represents the typical formatted output
from \code{authors_georef()}
in the refsplitr package. It serves as a useful testing data set for
diff --git a/man/authors_georef.Rd b/man/authors_georef.Rd
index 01546c6..0debb64 100644
--- a/man/authors_georef.Rd
+++ b/man/authors_georef.Rd
@@ -4,37 +4,36 @@
\alias{authors_georef}
\title{Extracts the lat and long for each address from authors_clean}
\usage{
-authors_georef(data, address_column = "address")
+authors_georef(data, address_column = "address", google_api = FALSE)
}
\arguments{
\item{data}{dataframe from \code{authors_refine()}}
\item{address_column}{name of column in quotes where the addresses are}
+
+\item{google_api}{if \code{google_api = FALSE}, georeferencing is carried out with
+the \code{tidygeocoder} package (via \code{geocode()} with \code{method = 'osm'}).
+If \code{google_api = TRUE}, geocoding is done with the Google Maps API.
+Defaults to \code{FALSE}.}
}
\description{
\code{authors_georef} This function takes the final author list from
-refine_authors, and calculates the lat long of the addresses.
-It does this by feeding the addresses into data science toolkit.
-In order to maximize effectiveness and mitigate errors in parsing addresses
-We run this multiple times creating addresses in different ways
-in hopes that the google georeferencing API can recognize an address
-1st. University, city, zipcode, country
-2nd. City, zipcode, country
-3rd. city, country
-4th. University, country
+refine_authors and calculates the latitude and longitude of each address,
+using the city, state, postal code, and country (for USA addresses) or the
+city and country (for all other addresses).
}
\details{
-The output is a list with three data.frames
-\code{addresses} is a data frame with all information from
-refine_authors plus new location columns and calculated lat longs.
-\code{missing addresses} is a data frame with all addresses could
-not be geocoded
-\code{addresses} is a data frame like \code{addresses} except
-the missing addresses are gone.
+The output is a list of three data.frames:
+\code{addresses}: all info from 'refine_authors' plus new \code{lat} and
+\code{lon} columns. It includes ALL addresses, including those that could
+not be geocoded.
+\code{missing_addresses}: only the addresses that could NOT be geocoded.
+\code{no_missing_addresses}: the \code{addresses} data frame with ONLY the
+addresses that were geocoded.
}
\examples{
-
\dontrun{
-BITR_georef_df <- authors_georef(BITR_refined, address_column='address')
+BITR_georef_df <- authors_georef(BITR_refined, address_column = "address",
+google_api=FALSE)
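+## The output list can then be inspected, e.g. (a hypothetical next step)
+## keeping only the successfully geocoded rows:
+## BITR_latlon <- BITR_georef_df$no_missing_addresses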
}
}
diff --git a/man/references_read.Rd b/man/references_read.Rd
index aba14e6..2c43b01 100644
--- a/man/references_read.Rd
+++ b/man/references_read.Rd
@@ -9,7 +9,7 @@ references_read(data = ".", dir = FALSE, include_all = FALSE)
\arguments{
\item{data}{the location of the file or files to be imported. This can be either the absolute or
relative name of the file (for a single file) or folder (for multiple files stored in the same folder;
-used in conjuction with `dir = TRUE``). If left blank it is assumed the location is the working directory.}
+used in conjunction with `dir = TRUE`). If left blank it is assumed the location is the working directory.}
\item{dir}{if FALSE it is assumed a single file is to be imported.
Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=`;
@@ -17,25 +17,24 @@ all files in the folder will be imported). Defaults to FALSE.}
\item{include_all}{if FALSE only a subset of commonly used fields from references records are imported.
If TRUE then all fields from the reference records are imported. Defaults to FALSE.
-The additional data fields included if \code{include_all=TRUE}: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI,
-LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR.}
+The additional data fields included if \code{include_all=TRUE}: CC, CH, CL, CT, CY, FX, GA, J9,
+LA, PA, PI, PN, PS, RID, SU, VR, OA.}
}
\description{
\code{references_read} This function reads Thomson Reuters Web of Knowledge
and ISI format reference data files into an R-friendly data format. The resulting dataframe
-is the argument for the refplitr function \code{authors_clean()}.
+is the argument for the refsplitr function \code{authors_clean()}.
}
\examples{
-## If a single files is being imported from a folder called "data" located in an RStudio Project:
+## If a single file is being imported from a folder called "data" located in an RStudio Project:
## imported_refs<-references_read(data = './data/refs.txt', dir = FALSE, include_all=FALSE)
## If multiple files are being imported from a folder named "heliconia" nested within a folder
-## called "data" located in an RStudio Project:
+## called "data" located in an RStudio Project:
## heliconia_refs<-references_read(data = './data/heliconia', dir = TRUE, include_all=FALSE)
-## To load the Web of Science records used in the examples in the documentation
-BITR_data_example <- system.file('extdata', 'BITR_test.txt', package = 'refsplitr')
+## To load the Web of Science records used in the examples in the documentation
+BITR_data_example <- system.file("extdata", "BITR_test.txt", package = "refsplitr")
BITR <- references_read(BITR_data_example)
-
}
diff --git a/man/refsplitr-package.Rd b/man/refsplitr-package.Rd
index 495c191..f8b4e96 100644
--- a/man/refsplitr-package.Rd
+++ b/man/refsplitr-package.Rd
@@ -6,7 +6,7 @@
\alias{refsplitr-package}
\title{refsplitr: author name disambiguation, author georeferencing, and mapping of coauthorship networks with 'Web of Science' data}
\description{
-\if{html}{\figure{logo.png}{options: align='right' alt='logo' width='120'}}
+\if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}}
Tools to parse and organize reference records downloaded from the 'Web of Science' citation database into an R-friendly format, disambiguate the names of authors, geocode their locations, and generate/visualize coauthorship networks. This package has been peer-reviewed by rOpenSci (v. 1.0).
}
diff --git a/tests/extdata/BadHeader.txt b/tests/extdata/BadHeader.txt
index 37762bd..ce212a2 100644
--- a/tests/extdata/BadHeader.txt
+++ b/tests/extdata/BadHeader.txt
@@ -1,4 +1,4 @@
-XX Thomson Reuters Web of Science™
+XX Thomson Reuters Web of Science™
VR 1.0
PT J
AU Sobrinho, MS
@@ -345,4 +345,4 @@ BP 1129
EP 1129
UT WOS:000341179800029
PM 25190784
-ER
+ER
\ No newline at end of file
diff --git a/tests/extdata/PubExample.txt b/tests/extdata/PubExample.txt
index b410da1..6761896 100644
--- a/tests/extdata/PubExample.txt
+++ b/tests/extdata/PubExample.txt
@@ -1,4 +1,4 @@
-FN Thomson Reuters Web of Science™
+FN Thomson Reuters Web of Science™
VR 1.0
PT J
AU Sobrinho, MS
@@ -345,4 +345,4 @@ BP 1129
EP 1129
UT WOS:000341179800029
PM 25190784
-ER
+ER
\ No newline at end of file
diff --git a/tests/testthat/test_authors_address.R b/tests/testthat/test_authors_address.R
index d0793c0..4fdbb4a 100644
--- a/tests/testthat/test_authors_address.R
+++ b/tests/testthat/test_authors_address.R
@@ -16,10 +16,10 @@ address=c("Univ Sydney, Fac Vet Sci, Sch Life & Environm Sci,
actual<-authors_address(df$address, df$authorID)
expect_false(any(is.na(actual$country)))
expect_false(grepl('BR',actual$postal_code[actual$country=='brazil']))
-expect_equal(sum(grepl("[a-z]{1}[0-9]{1}[a-z]{1}\\s[0-9]{1}[a-z]{1}[0-9]{1}",
+expect_equal(sum(grepl("[a-z]{1}[0-9]{1}[a-z]{1}[0-9]{1}[a-z]{1}[0-9]{1}",
actual$postal_code)),2)
expect_equal(unique(actual$country[grepl(
- "[a-z]{1}[0-9]{1}[a-z]{1}\\s[0-9]{1}[a-z]{1}[0-9]{1}",
+ "[a-z]{1}[0-9]{1}[a-z]{1}[0-9]{1}[a-z]{1}[0-9]{1}",
actual$postal_code)]),'canada')
expect_equal(c(actual$country[8], actual$state[8], actual$postal_code[8]),
c('usa','fl','33312'))
diff --git a/vignettes/refsplitr.Rmd b/vignettes/refsplitr.Rmd
index 16c68f7..60444f1 100644
--- a/vignettes/refsplitr.Rmd
+++ b/vignettes/refsplitr.Rmd
@@ -1,7 +1,7 @@
---
title: "refsplitr"
author: "Auriel M. V. Fournier, Matthew E. Boone, Forrest R. Stevens, Emilio M. Bruna"
-date: "2022-02-04"
+date: "2025-03-24"
output:
rmarkdown::html_vignette:
fig_width: 6
@@ -43,7 +43,7 @@ The `refsplitr` package can either import a single Web of Science search result
- **dir**: when loading a single file dir=FALSE, when loading multiple files dir=TRUE. If multiple files are processed `refsplitr` will identify and remove any duplicate reference records.
-- **include_all**: Setting 'include_all=TRUE' will import all fields from the WOS record (see Appendix 2). The defailt is 'include_all=FALSE'.
+- **include_all**: Setting 'include_all=TRUE' will import all fields from the WOS record (see Appendix 2). The default is 'include_all=FALSE'.
The output of `references_read()` is an object in the R workspace. Each line of the output is a reference; the columns are the name of the .txt file from which the data were extracted, a unique id number assigned by `refsplitr` to each article, and the data from each field of the reference record (see Appendix 2 for a list of these data fields and their [Web of Science](https://images.webofknowledge.com/images/help/WOS/hs_wos_fieldtags.html) and RIS codes). This object is used by `refsplitr` in Step 2.2; we recommend also saving it as a .csv file in the "output" folder.
@@ -52,8 +52,8 @@ The output of `references_read()` is an object in the R workspace. Each line of
a. To import and process a single file, set dir=FALSE and set data equal to the file path. For example, if the file "example_data.txt" were saved in the "data" folder of the RStudio project, you would import and process the data file as follows:
-
-```r
+
+``` r
example_refs <- references_read(data = "./data/example_data.txt",
dir=FALSE,
include_all = FALSE)
@@ -63,7 +63,7 @@ example_refs <- references_read(data = "./data/example_data.txt",
b. To import and process multiple files, set "dir = TRUE" and use "data=" to indicate the folder containing the files. For instance, if the files were saved in a folder called "UF_data" inside the "data" folder of the RStudio project, they would be imported and processed as follows:
-```r
+``` r
example_refs <- references_read(data = "./data/UF_data",
dir=TRUE,
include_all = FALSE)
@@ -73,7 +73,7 @@ example_refs <- references_read(data = "./data/UF_data",
c. The sample data used in the examples below can be loaded and processed as follows:
-```r
+``` r
example_refs <- references_read(data = system.file("extdata",package = "refsplitr"),
dir = TRUE,
include_all = FALSE)
@@ -82,11 +82,14 @@ example_refs <- references_read(data = system.file("extdata",package = "refsplit
d. The processed references can then be saved as a .csv file in the "output" folder of the RStudio project:
-```r
+``` r
write.csv(example_refs,"./output/example_refs.csv")
```
-
+