From f9695a4cda98eeaa5354d2769c66df73be8443dd Mon Sep 17 00:00:00 2001 From: embruna Date: Thu, 16 Jan 2025 17:40:02 -0500 Subject: [PATCH 01/34] tidygeocider now default for geocoding --- R/authors_georef.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/authors_georef.R b/R/authors_georef.R index 574a70e..f68bebb 100644 --- a/R/authors_georef.R +++ b/R/authors_georef.R @@ -21,6 +21,9 @@ #' #' @param data dataframe from `authors_refine()` #' @param address_column name of column in quotes where the addresses are +#' #' @param google_api if FALSE georeferencing is carried out with the +#' tidygeocoder package (geocode() with method = 'osm'). if TRUE geocoding +#' is done with the google maps API. Defaults to FALSE. #' @importFrom ggmap geocode #' #' @examples From 8ac11406997cb0e96823aec4147c27b483d93a44 Mon Sep 17 00:00:00 2001 From: embruna Date: Thu, 16 Jan 2025 17:44:08 -0500 Subject: [PATCH 02/34] tidygeoicoder new dfault --- R/authors_georef.R | 403 ++++++++++++++++++++++++++++----------------- 1 file changed, 249 insertions(+), 154 deletions(-) diff --git a/R/authors_georef.R b/R/authors_georef.R index f68bebb..f8d629b 100644 --- a/R/authors_georef.R +++ b/R/authors_georef.R @@ -1,8 +1,8 @@ #' Extracts the lat and long for each address from authors_clean #' -#' \code{authors_georef} This function takes the final author list from -#' refine_authors, and calculates the lat long of the addresses. -#' It does this by feeding the addresses into data science toolkit. +#' \code{authors_georef} This function takes the final author list from +#' refine_authors, and calculates the lat long of the addresses. +#' It does this by feeding the addresses into data science toolkit. #' In order to maximize effectiveness and mitigate errors in parsing addresses #' We run this multiple times creating addresses in different ways #' in hopes that the google georeferencing API can recognize an address @@ -10,9 +10,9 @@ #' 2nd. City, zipcode, country #' 3rd. city, country #' 4th. University, country -#' +#' #' The output is a list with three data.frames -#' \code{addresses} is a data frame with all information from +#' \code{addresses} is a data frame with all information from #' refine_authors plus new location columns and calculated lat longs. #' \code{missing addresses} is a data frame with all addresses could #' not be geocoded @@ -21,173 +21,270 @@ #' #' @param data dataframe from `authors_refine()` #' @param address_column name of column in quotes where the addresses are -#' #' @param google_api if FALSE georeferencing is carried out with the -#' tidygeocoder package (geocode() with method = 'osm'). if TRUE geocoding +#' #' @param google_api if FALSE georeferencing is carried out with the +#' tidygeocoder package (geocode() with method = 'osm'). if TRUE geocoding #' is done with the google maps API. Defaults to FALSE. #' @importFrom ggmap geocode -#' -#' @examples -#' +#' +#' @examples #' \dontrun{ -#' BITR_georef_df <- authors_georef(BITR_refined, address_column='address') +#' BITR_georef_df <- authors_georef(BITR_refined, address_column = "address") #' } -#' @export authors_georef -#' -authors_georef <- function(data, - address_column = "address") { - - options(ggmap = list(display_api_key = FALSE)) - if (!is.character(data$address)) { - stop("Address columns are not characters, +#' @export authors_georef +#' + +if (google_api == TRUE) { + pt1 <- ("Attention: You have chosen to geocode with the GOOGLE API.\n") + pt2 <- ("This is NOT a free service.\n") + pt3 <- ("Please refer to Google's current billing rates & usage limits.\n") + + message(paste(pt1, pt2, pt3, sep = "")) + rm(pt1, pt2, pt3) + + authors_georef <- function( + data, + address_column = "address") { + options(ggmap = list(display_api_key = FALSE)) + if (!is.character(data$address)) { + stop("Address columns are not characters, please change to characters and try again") - } - addresses <- data[, c("university", "city", "state", "country", - "postal_code", "authorID", "address")] - #Change some formatting to help data science toolkit - addresses$university[is.na(addresses$university)] <- "" - addresses$country[is.na(addresses$country)] <- "" - addresses$postal_code[is.na(addresses$postal_code)] <- "" - addresses$city[is.na(addresses$city)] <- "" - addresses$state[is.na(addresses$state)] <- "" - addresses$country <- trimws(addresses$country, which = "both") - addresses$city <- trimws(addresses$city, which = "both") - addresses$state <- trimws(addresses$state, which = "both") - addresses$university <- trimws(addresses$university, which = "both") - - # create Short form address base to defaul address - # rougly adheres to universty, city, zipcode, country - addresses$base <- addresses$country - addresses$base[addresses$postal_code != ""] <- - paste0(addresses$base[addresses$postal_code != ""], - ", ", - addresses$postal_code[addresses$postal_code != ""]) - - addresses$base[addresses$state != ""] <- - paste0(addresses$state[addresses$state != ""], - ", ", - addresses$country[addresses$state != ""]) - - # second tier, city > zip > university - addresses$second <- NA - addresses$second[addresses$city != ""] <- addresses$city[addresses$city != ""] - addresses$second[is.na(addresses$second) & addresses$university != ""] <- - addresses$university[is.na(addresses$second) & addresses$university != ""] - - addresses$short_address <- addresses$base - addresses$short_address[!is.na(addresses$second)] <- - paste0(addresses$second[!is.na(addresses$second)], - ", ", - addresses$short_address[!is.na(addresses$second)]) - addresses$lat <- NA - addresses$lon <- NA - addresses$adID <- seq_len(nrow(addresses)) - - # # we'll check if data science toolkit is working, by pinging a known address - check_ad <- "1600 Pennsylvania Ave NW, Washington, DC 20500" - check.open <- sum(is.na(ggmap::geocode(check_ad, source = "google", urlonly = TRUE))) == 0 - if (!check.open) { - stop("google geocoding API is down right now, please try again later") - } + } + addresses <- data[, c( + "university", "city", "state", "country", + "postal_code", "authorID", "address" + )] + # Change some formatting to help data science toolkit + addresses$university[is.na(addresses$university)] <- "" + addresses$country[is.na(addresses$country)] <- "" + addresses$postal_code[is.na(addresses$postal_code)] <- "" + addresses$city[is.na(addresses$city)] <- "" + addresses$state[is.na(addresses$state)] <- "" + addresses$country <- trimws(addresses$country, which = "both") + addresses$city <- trimws(addresses$city, which = "both") + addresses$state <- trimws(addresses$state, which = "both") + addresses$university <- trimws(addresses$university, which = "both") - #Lets try broad strokes first. Our 4 layered address - - ggmap::register_google( - key = ggmap::google_key(), - write = TRUE, - second_limit = 50, - day_limit = 2500 - ) - - for (i in addresses$adID[addresses$short_address != ""]) { - - address <- as.character(addresses$short_address[i]) - #if (address == '') next - message(paste("Working... ", address)) - - suppressWarnings(result <- ggmap::geocode(address, - output = "latlona", - source = "google", - messaging = TRUE + # create Short form address base to defaul address + # rougly adheres to universty, city, zipcode, country + addresses$base <- addresses$country + addresses$base[addresses$postal_code != ""] <- + paste0( + addresses$base[addresses$postal_code != ""], + ", ", + addresses$postal_code[addresses$postal_code != ""] + ) + + addresses$base[addresses$state != ""] <- + paste0( + addresses$state[addresses$state != ""], + ", ", + addresses$country[addresses$state != ""] + ) + + # second tier, city > zip > university + addresses$second <- NA + addresses$second[addresses$city != ""] <- addresses$city[addresses$city != ""] + addresses$second[is.na(addresses$second) & addresses$university != ""] <- + addresses$university[is.na(addresses$second) & addresses$university != ""] + + addresses$short_address <- addresses$base + addresses$short_address[!is.na(addresses$second)] <- + paste0( + addresses$second[!is.na(addresses$second)], + ", ", + addresses$short_address[!is.na(addresses$second)] + ) + addresses$lat <- NA + addresses$lon <- NA + addresses$adID <- seq_len(nrow(addresses)) + + # # we'll check if data science toolkit is working, by pinging a known address + check_ad <- "1600 Pennsylvania Ave NW, Washington, DC 20500" + check.open <- sum(is.na(ggmap::geocode(check_ad, source = "google", urlonly = TRUE))) == 0 + if (!check.open) { + stop("google geocoding API is down right now, please try again later") + } + + # Lets try broad strokes first. Our 4 layered address + + ggmap::register_google( + key = ggmap::google_key(), + write = TRUE, + second_limit = 50, + day_limit = 2500 + ) + + for (i in addresses$adID[addresses$short_address != ""]) { + address <- as.character(addresses$short_address[i]) + # if (address == '') next + message(paste("Working... ", address)) + + suppressWarnings(result <- ggmap::geocode(address, + output = "latlona", + source = "google", + messaging = TRUE )) - addresses$lat[addresses$adID == i] <- result[[2]] - addresses$lon[addresses$adID == i] <- result[[1]] - } + addresses$lat[addresses$adID == i] <- result[[2]] + addresses$lon[addresses$adID == i] <- result[[1]] + } - # Now lets try using a shorter code (city, state, country) - remain <- addresses[is.na(addresses$lat), ] - remain$short_address <- - ifelse(!(is.na(remain$state) | is.na(remain$country)), - paste0(remain$city, ", ", remain$state, ", ", remain$country), - NA) - remain <- remain[!is.na(remain$short_address) & + # Now lets try using a shorter code (city, state, country) + remain <- addresses[is.na(addresses$lat), ] + remain$short_address <- + ifelse(!(is.na(remain$state) | is.na(remain$country)), + paste0(remain$city, ", ", remain$state, ", ", remain$country), + NA + ) + remain <- remain[!is.na(remain$short_address) & remain$short_address != ", , ", ] - for (i in remain$adID) { - address <- as.character(remain$short_address[remain$adID == i]) - message(paste("Working... ", address)) - suppressWarnings(result <- ggmap::geocode(address, - output = "latlona", - source = "google", - messaging = TRUE - )) - addresses$lat[addresses$adID == i] <- result[[2]] - addresses$lon[addresses$adID == i] <- result[[1]] - } + for (i in remain$adID) { + address <- as.character(remain$short_address[remain$adID == i]) + message(paste("Working... ", address)) + suppressWarnings(result <- ggmap::geocode(address, + output = "latlona", + source = "google", + messaging = TRUE + )) + addresses$lat[addresses$adID == i] <- result[[2]] + addresses$lon[addresses$adID == i] <- result[[1]] + } - # Now try city, country - remain <- addresses[is.na(addresses$lat), ] - remain$short_address <- - ifelse(!(is.na(remain$city) | is.na(remain$country)), - paste0(remain$city, ", ", remain$country), - NA) + # Now try city, country + remain <- addresses[is.na(addresses$lat), ] + remain$short_address <- + ifelse(!(is.na(remain$city) | is.na(remain$country)), + paste0(remain$city, ", ", remain$country), + NA + ) - remain <- remain[!is.na(remain$short_address) & + remain <- remain[!is.na(remain$short_address) & remain$short_address != ", ", ] - for (i in remain$adID) { - address <- as.character(remain$short_address[remain$adID == i]) - message(paste("Working... ", address)) - suppressWarnings(result <- ggmap::geocode(address, - output = "latlona", - source = "google", - messaging = TRUE - )) - addresses$lat[addresses$adID == i] <- result[[2]] - addresses$lon[addresses$adID == i] <- result[[1]] - } + for (i in remain$adID) { + address <- as.character(remain$short_address[remain$adID == i]) + message(paste("Working... ", address)) + suppressWarnings(result <- ggmap::geocode(address, + output = "latlona", + source = "google", + messaging = TRUE + )) + addresses$lat[addresses$adID == i] <- result[[2]] + addresses$lon[addresses$adID == i] <- result[[1]] + } - # Finally try using just university, country - remain <- addresses[is.na(addresses$lat), ] - remain$short_address <- - ifelse(!(is.na(remain$university) | is.na(remain$country)), - paste0(remain$university, ", ", remain$country), - NA) + # Finally try using just university, country + remain <- addresses[is.na(addresses$lat), ] + remain$short_address <- + ifelse(!(is.na(remain$university) | is.na(remain$country)), + paste0(remain$university, ", ", remain$country), + NA + ) - remain <- remain[!is.na(remain$short_address) & + remain <- remain[!is.na(remain$short_address) & remain$short_address != ", ", ] - for (i in remain$adID) { - address <- as.character(remain$short_address[remain$adID == i]) - message(paste("Working... ", address)) - suppressWarnings(result <- ggmap::geocode(address, - output = "latlona", - source = "google", - messaging = TRUE - )) - addresses$lat[addresses$adID == i] <- result[[2]] - addresses$lon[addresses$adID == i] <- result[[1]] + for (i in remain$adID) { + address <- as.character(remain$short_address[remain$adID == i]) + message(paste("Working... ", address)) + suppressWarnings(result <- ggmap::geocode(address, + output = "latlona", + source = "google", + messaging = TRUE + )) + addresses$lat[addresses$adID == i] <- result[[2]] + addresses$lon[addresses$adID == i] <- result[[1]] + } + + ## Change "" back to NA + addresses$country[addresses$country == ""] <- NA + addresses$university[addresses$university == ""] <- NA + addresses$postal_code[addresses$postal_code == ""] <- NA + + addresses <- + merge( + addresses[, c( + "authorID", "university", "postal_code", + "country", "lat", "lon" + )], + data[, c( + "authorID", "groupID", "author_order", "address", + "department", "RP_address", "RI", "OI", "UT", "refID" + )], + by = "authorID", all.y = TRUE + ) + + missingaddresses <- addresses[is.na(addresses$lat), ] + addresses$lat <- unlist(addresses$lat) + addresses$lon <- unlist(addresses$lon) + + outputlist <- list() + outputlist$addresses <- addresses + outputlist$missing_addresses <- missingaddresses + outputlist$not_missing_addresses <- addresses[!is.na(addresses$lat), ] + + # reset ggmaps option to TRUE. This only until the ggmaps gets fixed + on.exit(options(ggmap = list(display_api_key = TRUE))) + return(outputlist) } +} else { + pt1 <- ("You are Geocoding with OpenStreetMap.\n") + pt2 <- ("This proceeds at a rate of 1 address/second.\n") + pt3 <- ("For large data sets: OSM requests that you consider downloading\n") + pt4 <- ("the complete database to query locally instead of using the API.\n") + pt5 <- ("See the Refsplitr vignette for more information.\n") + message(paste(pt1, pt2, pt3, pt4, pt5, sep = "")) + rm(pt1, pt2, pt3, pt4, pt5) + + # select the following columns from the fll dataframe + # a_df<-("authorID", "city","state","postal_code","country") + a_df$addr <- NA + a_df$addr <- ifelse(is.na(a_df$state), + paste(a_df$city, a_df$country, sep = ","), + paste(a_df$city, a_df$state, a_df$country, sep = ",") + ) + + a_df$addr <- ifelse(a_df$country == "Could not be extracted", + NA, + a_df$addr + ) - ## Change "" back to NA - addresses$country[addresses$country == ""] <- NA - addresses$university[addresses$university == ""] <- NA - addresses$postal_code[addresses$postal_code == ""] <- NA + to_georef_df <- unique(a_df$addr) + to_georef_df <- as.data.frame(to_georef_df) + colnames(to_georef_df) <- c("addr") + to_georef_df <- na.omit(to_georef_df) + # library(tidygeocoder) + to_georef_df <- to_georef_df |> tidygeocoder::geocode(addr, + method = "osm", + lat = latitude, long = longitude + ) + + + no_georef <- to_georef_df[is.na(to_georef_df$latitude), ] + + + + perc_missing <- (nrow(no_georef) / nrow(to_georef_df)) * 100 + print(paste("lat/longs are missing for ", round(perc_missing, 2), "% of the locations.", sep = "")) + print("check `outputlist$missing_addresses` to see which they are") + # no_georef + + + + # These get merged back into the original + # TODO: MAKE SURE THIS IS CORRECT!!!! addresses <- merge( - addresses[, c("authorID", "university", "postal_code", - "country", "lat", "lon")], - data[, c("authorID", "groupID", "author_order", "address", - "department", "RP_address", "RI", "OI", "UT", "refID")], - by = "authorID", all.y = TRUE) + to_georef_df[, c( + "authorID", "city", "state", "postal_code", + "country", "lat", "lon" + )], + a_df[, c( + "authorID", "groupID", "author_order", "address", "university", + "department", "RP_address", "RI", "OI", "UT", "refID" + )], + by = "authorID", all.y = TRUE + ) + missingaddresses <- addresses[is.na(addresses$lat), ] addresses$lat <- unlist(addresses$lat) @@ -198,7 +295,5 @@ authors_georef <- function(data, outputlist$missing_addresses <- missingaddresses outputlist$not_missing_addresses <- addresses[!is.na(addresses$lat), ] - # reset ggmaps option to TRUE. This only until the ggmaps gets fixed - on.exit(options(ggmap = list(display_api_key = TRUE))) return(outputlist) -} \ No newline at end of file +} From 2eb48eae0ccf02ca17774671d748a7f345f8489a Mon Sep 17 00:00:00 2001 From: embruna Date: Fri, 17 Jan 2025 08:33:34 -0500 Subject: [PATCH 03/34] added tidygeocoder to description --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index b4393de..4bf0f72 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -49,7 +49,8 @@ Imports: network, stringdist, rworldmap, - sna + sna, + tidygeocoder Suggests: covr, gdtools, From 5b4d91774087b8529b3d437526a90bd399087ea5 Mon Sep 17 00:00:00 2001 From: embruna Date: Fri, 17 Jan 2025 10:25:54 -0500 Subject: [PATCH 04/34] updated authors_georef to use tidydeocoder as default --- R/authors_georef.R | 137 +++++++++++++++++++++++++++++++-------------- 1 file changed, 94 insertions(+), 43 deletions(-) diff --git a/R/authors_georef.R b/R/authors_georef.R index f8d629b..726e3c9 100644 --- a/R/authors_georef.R +++ b/R/authors_georef.R @@ -12,12 +12,13 @@ #' 4th. University, country #' #' The output is a list with three data.frames -#' \code{addresses} is a data frame with all information from -#' refine_authors plus new location columns and calculated lat longs. -#' \code{missing addresses} is a data frame with all addresses could -#' not be geocoded -#' \code{addresses} is a data frame like \code{addresses} except -#' the missing addresses are gone. +#' \code{addresses} All info from 'refine_authors' plus new columns with +#' lat & long. It includes ALL addresses, including those +#' that could not be geocoded. +#' \code{missing_addresses} A data frame of the addresses that could +#' NOT be geocoded. +#' \code{no_missing_addresses} the \code{addresses} data frame with ONLY the +#' addresses that were geocoded. #' #' @param data dataframe from `authors_refine()` #' @param address_column name of column in quotes where the addresses are @@ -32,7 +33,11 @@ #' } #' @export authors_georef #' - +authors_georef <- function( + data, + address_column = "address", + google_api) { + if (google_api == TRUE) { pt1 <- ("Attention: You have chosen to geocode with the GOOGLE API.\n") pt2 <- ("This is NOT a free service.\n") @@ -41,10 +46,9 @@ if (google_api == TRUE) { message(paste(pt1, pt2, pt3, sep = "")) rm(pt1, pt2, pt3) - authors_georef <- function( - data, - address_column = "address") { + options(ggmap = list(display_api_key = FALSE)) + if (!is.character(data$address)) { stop("Address columns are not characters, please change to characters and try again") @@ -202,8 +206,7 @@ if (google_api == TRUE) { addresses <- merge( addresses[, c( - "authorID", "university", "postal_code", - "country", "lat", "lon" + "authorID", "lat", "lon" )], data[, c( "authorID", "groupID", "author_order", "address", @@ -219,12 +222,12 @@ if (google_api == TRUE) { outputlist <- list() outputlist$addresses <- addresses outputlist$missing_addresses <- missingaddresses - outputlist$not_missing_addresses <- addresses[!is.na(addresses$lat), ] + outputlist$no_missing_addresses <- addresses[!is.na(addresses$lat), ] # reset ggmaps option to TRUE. This only until the ggmaps gets fixed on.exit(options(ggmap = list(display_api_key = TRUE))) return(outputlist) - } + } else { pt1 <- ("You are Geocoding with OpenStreetMap.\n") pt2 <- ("This proceeds at a rate of 1 address/second.\n") @@ -234,24 +237,50 @@ if (google_api == TRUE) { message(paste(pt1, pt2, pt3, pt4, pt5, sep = "")) rm(pt1, pt2, pt3, pt4, pt5) + + + if (!is.character(data$address)) { + stop("Address columns are not characters, + please change to characters and try again") + } + # a_df <- data[, c( + # "university", "city", "state", "country", + # "postal_code", "authorID", "address" + # )] + + a_df <- data[, c( + "city", "state", "country", + "postal_code", "authorID" + )] + + a_df<-a_df[!is.na(a_df$country),] # select the following columns from the fll dataframe # a_df<-("authorID", "city","state","postal_code","country") a_df$addr <- NA - a_df$addr <- ifelse(is.na(a_df$state), - paste(a_df$city, a_df$country, sep = ","), - paste(a_df$city, a_df$state, a_df$country, sep = ",") - ) + + a_df$addr <- ifelse(a_df$country == "usa" & !is.na(a_df$state), + paste(a_df$city, a_df$state, a_df$country, sep = ","), + paste(a_df$city, a_df$country, sep = ",")) + # + # a_df$addr <- ifelse(is.na(a_df$state), + # paste(a_df$city, a_df$country, sep = ","), + # paste(a_df$city, a_df$state, a_df$country, sep = ",") + # ) a_df$addr <- ifelse(a_df$country == "Could not be extracted", NA, a_df$addr ) - + to_georef_df <- a_df$addr + # Find unique values of the 'id' column and keep all other columns + + to_georef_df <- unique(a_df$addr) to_georef_df <- as.data.frame(to_georef_df) colnames(to_georef_df) <- c("addr") - to_georef_df <- na.omit(to_georef_df) + # to_georef_df <- na.omit(to_georef_df) + # library(tidygeocoder) to_georef_df <- to_georef_df |> tidygeocoder::geocode(addr, method = "osm", @@ -259,41 +288,63 @@ if (google_api == TRUE) { ) - no_georef <- to_georef_df[is.na(to_georef_df$latitude), ] - - - - perc_missing <- (nrow(no_georef) / nrow(to_georef_df)) * 100 - print(paste("lat/longs are missing for ", round(perc_missing, 2), "% of the locations.", sep = "")) - print("check `outputlist$missing_addresses` to see which they are") - # no_georef - - - # These get merged back into the original # TODO: MAKE SURE THIS IS CORRECT!!!! - addresses <- + a_df <- merge( to_georef_df[, c( - "authorID", "city", "state", "postal_code", - "country", "lat", "lon" + "addr","latitude", "longitude" )], + a_df, + by = "addr", all.y = TRUE + ) + + data <- + merge( a_df[, c( - "authorID", "groupID", "author_order", "address", "university", - "department", "RP_address", "RI", "OI", "UT", "refID" + "authorID","latitude","longitude" )], - by = "authorID", all.y = TRUE + data, + by = c("authorID"), all.y = TRUE ) - - - missingaddresses <- addresses[is.na(addresses$lat), ] - addresses$lat <- unlist(addresses$lat) - addresses$lon <- unlist(addresses$lon) + + names(data)[names(data) == 'latitude'] <- 'lat' + names(data)[names(data) == 'longitude'] <- 'lon' + + + no_georef <- data[is.na(data$lat), ] + + addresses<-data + missingaddresses <- data[is.na(data$lat), ] + addresses$lat <- unlist(data$lat) + addresses$lon <- unlist(data$lon) outputlist <- list() outputlist$addresses <- addresses outputlist$missing_addresses <- missingaddresses - outputlist$not_missing_addresses <- addresses[!is.na(addresses$lat), ] + outputlist$no_missing_addresses <- addresses[!is.na(addresses$lat), ] return(outputlist) + + perc_missing <- (nrow(missingaddresses) / nrow(addresses)) * 100 + pt1<-c(paste("lat/longs are missing for ", + round(perc_missing, 2), "% of the author addresses.\n", sep = "")) + pt2<- c("Check `outputlist$missing_addresses` to see which.\n") + message(paste(pt1, pt2, sep = "")) + rm(pt1,pt2,perc_missing) + + + + pt1 <- ("The output is a list with three data.frames:\n") + pt2 <- ("output$addresses: all info from 'refine_authors' + plus new columns with lat & long. It includes ALL addresses, + including those that could not be geocoded. \n") + pt3 <- ("output$missing_addresses: a data frame of the addresses that + could NOT be geocoded.\n") + pt4 <- ("output$no_missing_addresses: a data frame with ONLY the addresses + that were geocoded. \n") + message(paste(pt1, pt2, pt3, pt4,sep = "")) + rm(pt1, pt2, pt3, pt4) + } +} \ No newline at end of file From b3076f9729452a7a3445384bc65e5510e1344bd4 Mon Sep 17 00:00:00 2001 From: embruna Date: Fri, 17 Jan 2025 10:51:47 -0500 Subject: [PATCH 05/34] added zip code to addr that gets georeferenced --- R/authors_georef.R | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/R/authors_georef.R b/R/authors_georef.R index 726e3c9..e4870fa 100644 --- a/R/authors_georef.R +++ b/R/authors_georef.R @@ -258,9 +258,18 @@ if (google_api == TRUE) { # a_df<-("authorID", "city","state","postal_code","country") a_df$addr <- NA - a_df$addr <- ifelse(a_df$country == "usa" & !is.na(a_df$state), - paste(a_df$city, a_df$state, a_df$country, sep = ","), - paste(a_df$city, a_df$country, sep = ",")) + +a_df$addr <- ifelse(a_df$country == "usa", + ifelse(!is.na(a_df$state), + ifelse(!is.na(a_df$postal_code), + paste(a_df$city, a_df$state, a_df$postal_code, a_df$country, sep = ","), + paste(a_df$city, a_df$state, a_df$country, sep = ",")), + ifelse(!is.na(a_df$postal_code), + paste(a_df$city, a_df$postal_code, a_df$country, sep = ","), + paste(a_df$city, a_df$country, sep = ","))), + paste(a_df$city, a_df$country, sep = ",")) + + # # a_df$addr <- ifelse(is.na(a_df$state), # paste(a_df$city, a_df$country, sep = ","), From 48aaa5802fe52b448824a6aeb469d8ae1d104167 Mon Sep 17 00:00:00 2001 From: embruna Date: Fri, 17 Jan 2025 11:01:18 -0500 Subject: [PATCH 06/34] corrected typo --- R/authors_georef.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/R/authors_georef.R b/R/authors_georef.R index e4870fa..569ba92 100644 --- a/R/authors_georef.R +++ b/R/authors_georef.R @@ -296,9 +296,7 @@ a_df$addr <- ifelse(a_df$country == "usa", lat = latitude, long = longitude ) - # These get merged back into the original - # TODO: MAKE SURE THIS IS CORRECT!!!! a_df <- merge( to_georef_df[, c( @@ -333,7 +331,7 @@ a_df$addr <- ifelse(a_df$country == "usa", outputlist$missing_addresses <- missingaddresses outputlist$no_missing_addresses <- addresses[!is.na(addresses$lat), ] - return(outputlist) + perc_missing <- (nrow(missingaddresses) / nrow(addresses)) * 100 pt1<-c(paste("lat/longs are missing for ", @@ -345,15 +343,16 @@ a_df$addr <- ifelse(a_df$country == "usa", pt1 <- ("The output is a list with three data.frames:\n") - pt2 <- ("output$addresses: all info from 'refine_authors' + pt2 <- ("outputlist$addresses: all info from 'refine_authors' plus new columns with lat & long. It includes ALL addresses, including those that could not be geocoded. \n") - pt3 <- ("output$missing_addresses: a data frame of the addresses that + pt3 <- ("outputlist$missing_addresses: a data frame of the addresses that could NOT be geocoded.\n") - pt4 <- ("output$no_missing_addresses: a data frame with ONLY the addresses + pt4 <- ("outputlist$no_missing_addresses: a data frame with ONLY the addresses that were geocoded. \n") message(paste(pt1, pt2, pt3, pt4,sep = "")) rm(pt1, pt2, pt3, pt4) + return(outputlist) } } \ No newline at end of file From 3bb27f093d0966806817cfc168002d6f1a03a903 Mon Sep 17 00:00:00 2001 From: embruna Date: Fri, 17 Jan 2025 11:13:52 -0500 Subject: [PATCH 07/34] edited the note on how many couldn't be georef'd --- R/authors_georef.R | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/R/authors_georef.R b/R/authors_georef.R index 569ba92..1b31527 100644 --- a/R/authors_georef.R +++ b/R/authors_georef.R @@ -295,7 +295,15 @@ a_df$addr <- ifelse(a_df$country == "usa", method = "osm", lat = latitude, long = longitude ) - + no_latlon <- to_georef_df[is.na(to_georef_df$latitude), ] + perc_missing <- (nrow(no_latlon) / nrow(to_georef_df)) * 100 + + pt1<-c(paste("Unable to georef ", + round(perc_missing, 2), "% of author addresses.\n", sep = "")) + pt2<- c("Check `outputlist$missing_addresses` to see which ones.\n") + message(paste(pt1, pt2, sep = "")) + rm(pt1,pt2,perc_missing) + # These get merged back into the original a_df <- merge( @@ -332,13 +340,6 @@ a_df$addr <- ifelse(a_df$country == "usa", outputlist$no_missing_addresses <- addresses[!is.na(addresses$lat), ] - - perc_missing <- (nrow(missingaddresses) / nrow(addresses)) * 100 - pt1<-c(paste("lat/longs are missing for ", - round(perc_missing, 2), "% of the author addresses.\n", sep = "")) - pt2<- c("Check `outputlist$missing_addresses` to see which.\n") - message(paste(pt1, pt2, sep = "")) - rm(pt1,pt2,perc_missing) From b31444a0fe00c48707b3e91ef639b1a1d10218cd Mon Sep 17 00:00:00 2001 From: embruna Date: Tue, 21 Jan 2025 16:51:43 -0500 Subject: [PATCH 08/34] revising --- R/authors_address_update.R | 2074 ++++++++++++++++++++++++++++++++++++ 1 file changed, 2074 insertions(+) create mode 100644 R/authors_address_update.R diff --git a/R/authors_address_update.R b/R/authors_address_update.R new file mode 100644 index 0000000..9fbd3d5 --- /dev/null +++ b/R/authors_address_update.R @@ -0,0 +1,2074 @@ +#' Parses out address information and splits it into its respective parts. +#' This is an internal function used by \code{authors_clean} +#' +#' \code{authors_address} This function takes the output from +#' \code{references_read} and pulls out address information. Splitting it into +#' university, department, city, state, etc. +#' @param addresses the addresses +#' @param ID the authorID +#' @noRd +authors_address <- function(addresses, ID){ + + # DELETE + # library(tidyverse) + # library(refsplitr) + final<-read.csv("./data/wos_txt/final.csv") + addresses<-final$address + ID<-final$authorID + addresses<-tolower(addresses) + message("\nSplitting addresses\n") + list_address <- strsplit(addresses, ",") + + +# trim ws ----------------------------------------------------------------- + + list_address <- lapply(list_address, trimws) + + + + +# COUNTRIES --------------------------------------------------------------- + +# remove punctuation ---------------------------------------- + ## First remove periods and trim white space from countries. + ## helps avoids mistakes later on + remove_period_from_last <- function(list_address) { + lapply(list_address, function(x) { + if (length(x) > 0) { + x[length(x)] <- gsub("\\.$", "", x[length(x)]) + x[length(x)] <- trimws(x[length(x)], which = "both") + } + return(x) + }) + } + + list_address <- remove_period_from_last(list_address) + + +# correct names + # Define the function + correct_countries <- function(my_list, replacements) { + # Loop through each element of the list + for(i in 1:length(my_list)) { + # Get the length of the current element + len <- length(my_list[[i]]) + + # Check if the last item matches any of the target words + if(len > 0 && my_list[[i]][len] %in% names(replacements)) { + # Replace the last item with the corresponding replacement word + my_list[[i]][len] <- replacements[[my_list[[i]][len]]] + } + } + return(my_list) + } + # czechia = new name for czech republic + # TBD: united arab rep, + replacements <- c("austl" = "australia", + "c z" = "czechia", + "cz" = "czechia", + "czech republic" = "czechia", + "fed rep ger" = "germany", + "columbia" = "colombia", + "peoples r china" = "china", + "u arab emirates" = "united arab emirates", + "mongol peo rep" = "mongolia", + "dominican rep" = "dominican republic", + "fr polynesia" = "french polynesia", + "neth antilles" = "netherland antilles", + "trinid & tobago" = "trinidad & tobago", + "rep congo" = "congo", + "north ireland" = "northern ireland", + "syrian arab rep" = "syria" + ) + + + + list_address <- correct_countries(list_address, replacements) + + + +# Extract University ------------------------------------------------------ + + university_list <- vapply(list_address, function(x) x[1], character(1)) + + +# Extract the Department -------------------------------------------------- + + # If department is listed it is typically second + # this will be 2x checked later + ## EB: seond only if 4+ slots + dept_extract <- function(x) { + if (length(x) < 4) { + return(NA) + } else { + return(trimws(x[[2]])) + } + } + # + dept_list <- unlist(lapply(list_address, dept_extract)) + + # dept_list <- vapply(list_address, function(x) x[2], character(1)) + dept_list <- trimws(dept_list, which = "both") + + + +# Extract City ------------------------------------------------------------ + + # If there is only one element, then it can't have both city and country' + city_list <- vapply(list_address, function(x) { + n <- length(x) + if (n == 1) { + return("no city") # placeholder to replace with NA after function + } + + # In some cases city is next-to-last element, in others next-to-next-to-last + last_element <- x[[n]] + second_last <- if (n > 1) x[[n - 1]] else NA + third_last <- if (n > 2) x[[n - 2]] else NA + + # Check for India, China, & brazil, canada, australia, and UK. + # These countries' city is in multiple places + # This puts ina placeholder, which will be replaced later in + # the function that checks India and China + if(last_element %in% c("india", "china", + "brazil", "canada", "australia", + "scotland", "england", "wales", + "northern ireland")) + { + return("icb") # placeholder to replace with NA after function + } + + + if (grepl("usa",last_element)) { + return("icb") # placeholder to replace with NA after function + } + + # And of course a few other odd ones. This will check for + # other countries with specific rules. + if ((last_element == "australia" && second_last != "liverpool") || + last_element %in% c("wales") || + (last_element == "mexico" && second_last != "iztapalapa") || + (last_element == "argentina" && second_last == "df")) { + return(third_last) + } + + # Default case + return(second_last) + }, character(1)) + + # Cleanup + city_list <- trimws(city_list, which = "both") + city_list[city_list == "no city"] <- NA + city_list[city_list == "icb"] <- NA + + +# Extract Country --------------------------------------------------------- + + + country_list <- vapply(list_address, function(x) { + gsub("\\_", "", x[length(x)]) }, + character(1)) + +# Extracts zip & state from usa addresses --------------------------------- + + pc_list <- trimws(substr(country_list, 1, (vapply(regexpr("usa", + country_list), function(x) x[1], numeric(1))) - 1), which = "right") + state_list <- pc_list + + state_list[nchar(state_list) > 0] <- regmatches( + state_list[nchar(state_list) > 0], + regexpr("[[:lower:]]{2}", state_list[nchar(state_list) > 0]) + ) + state_list[state_list == ""] <- NA + + pc_list[nchar(pc_list) > 2] <- regmatches(pc_list[nchar(pc_list) > 2], + regexpr("[[:digit:]]{5}", pc_list[nchar(pc_list) > 2])) + pc_list[nchar(pc_list) < 3] <- "" + pc_list[pc_list == ""] <- NA + + + +# USA -------------------------------------------------------------------- + + process_usa_address <- function(x) { + if (length(x) == 1) { + return(c(NA, NA)) # Not usa + } + if (grepl(" usa", x[[length(x)]])) { + if (length(x) == 4) { + return(c(trimws(x[[length(x) - 1]]), trimws(x[length(x)]))) + } + if (length(x) == 5) { + return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x)]]))) + } + if (length(x) == 3) { + return(c(trimws(x[[length(x) - 1]]), trimws(x[length(x)]))) + } else { + return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x)]]))) + } + } + + return(c(NA, NA)) # Not usa + } + + + # Apply the function across all addresses using `mapply` + results <- t(mapply(process_usa_address, list_address)) + colnames(results) <- c("usa_city", "usa_state") + + results<-as.data.frame(results) + results$pc<-NA + results$country<-NA + extract_usa_postcodes <- function(df, usa_state, pc,country) { + # 6 digits + pattern <- "[0-9]{5}" + + # Loop through each row of the dataframe + for(i in 1:nrow(df)) { + # Find all matches of the pattern in the source column + matches <- gregexpr(pattern, df[i, usa_state]) + # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) + # Extract the matches + extracted_codes <- regmatches(df[i, usa_state], matches) + # If there's at least one match and the target column is NA, copy the first match to the target column + if(length(extracted_codes[[1]]) > 0 && is.na(df[i, pc])) { + df[i, pc] <- extracted_codes[[1]][1] + df[i, country] <- "usa" + # df[i, city_col] <- df[i, source_col] + + } + } + return(df) + } + + + results <- extract_usa_postcodes(results, "usa_state", "pc","country") + + + # any without numbers gets NA'd + results$pc[!grepl("\\d", results$pc)] <- NA + + # keep portions with numbers / remove city names + results$usa_city<-sub("[0-9]{5}","",results$usa_city) + results$usa_state<-sub("[0-9]{5}","",results$usa_state) + results$usa_state<-sub("usa","",results$usa_state) + results$usa_state<-trimws(results$usa_state, which = "both") + + results$country<-ifelse(is.na(results$usa_city)==FALSE,"usa",results$country) + + + + + # Update `city_list` if necessary + city_list <- ifelse(is.na(city_list), results$usa_city, city_list) + # + # # Update `state_list` if necessary + state_list<-ifelse(is.na(state_list),results$usa_state, state_list) + # + # # Update `pc_list` if necessary + pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) + # Update `country_list` if necessary + + country_list<-ifelse((grepl("usa",country_list)),"usa", country_list) + # remove any with "state_abbrev zip code" but no USA + country_list <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", country_list, ignore.case = TRUE), "usa", country_list) + + + + us_state_abbreviations_lower <- c("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", + "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", + "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", + "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", + "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy") + + country_list<-ifelse(country_list%in%us_state_abbreviations_lower,"usa", country_list) + rm(results) +# AUSTRALIA --------------------------------------------------------------- + + + ## Australia postal codes also separated + # sometimes in the next-to-last, sometimes in the next-to-next-to-last (with state) + + # First need to "fix" the city + # Function to check for three characters or numbers in the city-list and replace with NA + process_aus_address <- function(x) { + if (length(x) == 1) { + return(c(NA, NA)) # Not Australia + } + + if (x[[length(x)]] == "australia") { + if (length(x) == 3) { + return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) + } else { + return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) + } + } + + return(c(NA, NA)) # Not Australia + } + + # Apply the function across all addresses using `mapply` + results <- t(mapply(process_aus_address, list_address)) + colnames(results) <- c("aus_city", "aus_state") + + # Clean up results + results <- as.data.frame(results, stringsAsFactors = FALSE) + results$aus_city[results$aus_city == "not australia"] <- NA + results$aus_state[results$aus_state == "not australia"] <- NA + + # take the PC+state and assign to PC + results$aus_pc<-results$aus_state + # remove all digits from state + results$aus_state <- trimws(gsub("\\d", "", results$aus_state)) + # remove all letters from pc + results$aus_pc <- trimws(gsub("[a-z]", "", results$aus_pc)) + results$aus_pc[results$aus_pc == ""] <- NA + + # if na in PC, assign the city (some of which have PC) + results$aus_pc <- ifelse(is.na(results$aus_pc), results$aus_city, results$aus_pc) + # remove all metters from pc, leaving any new pc + results$aus_pc <- trimws(gsub("[a-z]", "", results$aus_pc)) + results$aus_pc[results$aus_pc == ""] <- NA + # now remove any PC from city + results$aus_city <- trimws(gsub("\\d", "", results$aus_city)) #alternative + + # Update `city_list` if necessary + city_list <- ifelse(is.na(city_list), results$aus_city, city_list) + + # Update `state_list` if necessary + state_list<-ifelse(is.na(state_list),results$aus_state, state_list) + + # Update `pc_list` if necessary + pc_list<-ifelse(is.na(pc_list),results$aus_pc, pc_list) + + rm(results) + + + +# CANADA ------------------------------------------------------------------ + + ## Canada postal codes also separated + # sometimes in the next-to-last, sometimes in the next-to-next-to-last (with state) + + process_can_address <- function(x) { + if (length(x) == 1) { + return(c(NA, NA)) # Not Canada + } + + if (x[[length(x)]] == "canada") { + if (length(x) == 3) { + return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) + } else { + return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) + } + } + + return(c(NA, NA)) # Not canada + } + + # Apply the function across all addresses using `mapply` + results <- t(mapply(process_can_address, list_address)) + colnames(results) <- c("can_city", "can_state") + + # Clean up results + results <- as.data.frame(results, stringsAsFactors = FALSE) + results$can_city[results$can_city == "not canada"] <- NA + results$can_state[results$can_state == "not canada"] <- NA + + # take the PC+state and assign to PC + results$can_pc<-results$can_state + + # any without numbers gets NA'd + results$can_pc[!grepl("\\d", results$can_pc)] <- NA + # removes state and removes ltr at start of PC + results$can_pc<-sub("[a-z]\\s+[a-z]", "", results$can_pc) #+[a-z] + + + # if na in PC, assign the city (some of which have PC) + results$can_pc <- ifelse(is.na(results$can_pc), results$can_city, results$can_pc) + results$can_pc[!grepl("\\d", results$can_pc)] <- NA + + # keep portions with numbers / remove city names + # .*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$: This regex pattern matches any + # characters (.*) followed by a word boundary (\\b) and exactly three word + # characters (\\w{3}), capturing this as the first group ((\\w{3})). It + # then matches any characters again (.*) followed by another word boundary + # and exactly three word characters, capturing this as the second + # group ((\\w{3})), and ensures this is at the end of the string ($). + # sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", string): Replaces the + # entire string with the two captured groups separated by a space. + results$can_pc<-sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", results$can_pc) + + # results$can_pc<-gsub("\\b[^\\d\\s]+\\b", "", results$can_pc) #+[a-z] removes 2nd ltr + + results$can_pc[results$can_pc == ""] <- NA + # now remove any PC from city + results$can_city<-sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", results$can_city) # all after first space + + + # fix state + results$can_state <- trimws(gsub("\\d", "", results$can_state)) + results$can_state <-trimws(sub(" .*", "", results$can_state)) # all after first space + results$can_state <- gsub("british", "bc", results$can_state) + results$can_state <- gsub("nova", "ns", results$can_state) + # fix city + results$can_city <- trimws(gsub("\\d", "", results$can_city)) + # Update `city_list` if necessary + city_list <- ifelse(is.na(city_list), results$can_city, city_list) + + # Update `state_list` if necessary + state_list<-ifelse(is.na(state_list),results$can_state, state_list) + + # Update `pc_list` if necessary + pc_list<-ifelse(is.na(pc_list),results$can_pc, pc_list) + + rm(results) + +# INDIA ------------------------------------------------------------------ + + # India states are almost always listed but New Delhi is complicated, + # as are any with only three entries + # Function to process addresses for both city and state + process_india_address <- function(x) { + if (length(x) == 1) { + return(c(NA, NA)) # Not India + } + + if (x[[length(x)]] == "india") { + if (length(x) == 3) { + return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) + } else { + return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) + } + } + + if (length(x) != 3 && grepl("delhi", x[[length(x) - 1]])) { + return(c(trimws(x[[length(x) - 1]]), NA)) + } + + return(c(NA, NA)) # Not India + } + + # Apply the function across all addresses using `mapply` + results <- t(mapply(process_india_address, list_address)) + colnames(results) <- c("india_city", "india_state") + + # Clean up results + results <- as.data.frame(results, stringsAsFactors = FALSE) + results$india_city[results$india_city == "not india"] <- NA + results$india_state[results$india_state == "not india"] <- NA + + # Remove numeric parts from state names and trim whitespace + results$india_state <- trimws(gsub("\\s([0-9]+)", "", results$india_state)) + + # Update `city_list` if necessary + city_list <- ifelse(is.na(city_list), results$india_city, city_list) + #### + + state_list<-ifelse(is.na(state_list),results$india_state, state_list) + + rm(results) + + +# BRAZIL ------------------------------------------------------------------ + + + # Function to process addresses for both city and state + process_brazil_address <- function(x) { + if (length(x) == 1) { + return(c(NA, NA)) # Not brl + } + if (x[[length(x)]] == "brazil") { + if (length(x) == 3) { + return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) + } else { + return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) + } + } + + + return(c(NA, NA)) # Not brl + } + + + + # Apply the function across all addresses + results <- as.data.frame(t(mapply(process_brazil_address, list_address)), + stringsAsFactors = FALSE) + colnames(results) <- c("brl_city", "brl_state") + + + + + # Define words indicating this is actually a dept not state or postal code + # will use this list to delete the ones that don't apply + to_check <- c("dept","ctr","inst","ppg","andar","empresas", + "programa", "ciencias", "unidade", "lab ") + + results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), + results$brl_state, + results$brl_city) + + + + results$brl_state <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_state), + results$brl_city, + results$brl_state) + + + # Define the function + extract_brl_postcodes <- function(df, source_col, target_col,brl_new_city) { + # 6 digits + + pattern <- "br-[0-9]{5,8}" + + # Loop through each row of the dataframe + for(i in 1:nrow(df)) { + # Find all matches of the pattern in the source column + matches <- gregexpr(pattern, df[i, source_col]) + # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) + # Extract the matches + extracted_codes <- regmatches(df[i, source_col], matches) + # If there's at least one match and the target column is NA, copy the first match to the target column + if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { + df[i, target_col] <- extracted_codes[[1]][1] + df[i, brl_new_city] <- df[i, source_col] + # df[i, city_col] <- df[i, source_col] + + } + } + return(df) + } + + results$brl_pc<-NA + results$brl_new_city<-NA + # df, source_col, target_col,city_col + results <- extract_brl_postcodes(results, "brl_city","brl_pc", "brl_new_city") + results <- extract_brl_postcodes(results, "brl_state","brl_pc", "brl_new_city") + + + results$brl_new_city <- ifelse(is.na(results$brl_new_city), + results$brl_state, + results$brl_new_city) + + + results$brl_state <- ifelse(results$brl_new_city==results$brl_state, + results$brl_city, + results$brl_state) + + + + + results$brl_new_city <- ifelse(is.na(results$brl_new_city), + results$brl_city, + results$brl_new_city) + + + + + results$brl_city<-gsub("br-","",results$brl_city) + results$brl_city<-sub("[0-9]{2,8}","",results$brl_city) + results$brl_city<-sub("-","",results$brl_city) + + + results$brl_new_city<-gsub("br-","",results$brl_new_city) + results$brl_new_city<-sub("[0-9]{2,8}","",results$brl_new_city) + results$brl_new_city<-sub("-","",results$brl_new_city) + + + results$brl_pc<-gsub("br-","",results$brl_pc) + results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) + results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) + + results$brl_state<-gsub("br-","",results$brl_state) + results$brl_state<-sub("[0-9]{2,8} ","",results$brl_state) + results$brl_state<-sub(" [0-9]{2,8} ","",results$brl_state) + + # + # + # + # + # + # results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) + # results$brl_new_city<-gsub("gavea rio de janeiro","rio de janeiro",results$brl_new_city) + # results$brl_new_city<-gsub("gavea","rio de janeiro",results$brl_new_city) + # # + # results$brl_city<-gsub("-","",results$brl_city) + # + # results$brl_state<-gsub("br-","",results$brl_state) + # results$brl_state<-gsub("-","",results$brl_state) + + + + + + # any without numbers gets NA'd + results$brl_pc[!grepl("\\d", results$brl_pc)] <- NA + + # keep portions with numbers / remove city names + + results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) + results$brl_city<-sub(" [0-9]{2,8}","",results$brl_city) + # + # # Specific replacements + # results$brl_city <- ifelse(grepl("rio de janeiro", results$brl_city), + # "rio de janeiro", results$brl_city) + + results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), + "rio de janeiro", results$brl_city) + results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), + "rj", results$brl_state) + results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), + "sp", results$brl_state) + + + + results$brl_city[results$brl_city==results$brl_state]<-NA + + + # Clean up and adjust columns + results[] <- lapply(results, trimws) + + + # Define city-to-state mapping + city_state_mapping <- data.frame( + city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), + state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), + stringsAsFactors = FALSE + ) + + # Match cities and states + for (i in 1:nrow(city_state_mapping)) { + results$brl_city <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), + city_state_mapping$city[i], results$brl_city) + results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), + city_state_mapping$state[i], results$brl_state) + } + + + # Match cities and states + for (i in 1:nrow(city_state_mapping)) { + + results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), + city_state_mapping$state[i], results$brl_state) + } + + + + results$brl_state <- ifelse(results$brl_new_city==results$brl_state, + results$brl_city, + results$brl_state) + + + results$brl_city <- trimws(results$brl_city, which = "both") + results$brl_state <- trimws(results$brl_state, which = "both") + # Define words indicating this is actually a dept not state or postal code + # will use this list to delete the ones that don't apply + to_check <- c("dept","ctr","inst","ppg", + "programa", "ciencias", "unidade", "lab ") + + results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), + results$brl_state, + results$brl_city) + + + # Final trimming + results[] <- lapply(results, trimws) + + results$brl_city <- ifelse(results$brl_new_city==results$brl_city, + NA, + results$brl_city) + + + # Update `city_list` if necessary + city_list <- ifelse(is.na(city_list), results$brl_city, city_list) + # Update `state_list` if necessary + state_list<-ifelse(is.na(state_list),results$brl_state, state_list) + # Update `pc_list` if necessary + pc_list<-ifelse(is.na(pc_list),results$brl_pc, pc_list) + + + rm(results,city_state_mapping) + + + + + + + + + + + + + + + + + + + + + # Handle postal codes (BR-[0-9]) + results$brazil_pc <- ifelse(grepl("br-[0-9]", results$brazil_state), results$brazil_state, NA) + results$brazil_pc <- ifelse(grepl("br-[0-9]", results$brazil_city) & is.na(results$brazil_pc), + results$brazil_city, results$brazil_pc) + + # Remove BR codes from city and state + results$brazil_city <- gsub("br-[0-9]+", "", results$brazil_city) + results$brazil_state <- gsub("br-[0-9]+", "", results$brazil_state) + + # Define city-to-state mapping + city_state_mapping <- data.frame( + city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "sao paulo"), + state = c("sp", "sp", "sp", "sp", "rj", "rj", "sp"), + stringsAsFactors = FALSE + ) + + # Match cities and states + for (i in 1:nrow(city_state_mapping)) { + results$brazil_city <- ifelse(grepl(city_state_mapping$city[i], results$brazil_state, ignore.case = TRUE), + city_state_mapping$city[i], results$brazil_city) + results$brazil_state <- ifelse(grepl(city_state_mapping$city[i], results$brazil_state, ignore.case = TRUE), + city_state_mapping$state[i], results$brazil_state) + } + + # Specific replacements + results$brazil_city <- ifelse(grepl("museu nacl", results$brazil_city), + "rio de janeiro", results$brazil_city) + results$brazil_state <- ifelse(grepl("rio de janeiro", results$brazil_state, ignore.case = TRUE), + "rj", results$brazil_state) + results$brazil_state <- ifelse(grepl("sao paulo", results$brazil_state, ignore.case = TRUE), + "sp", results$brazil_state) + + # cleanup + results$brazil_city[results$brazil_city==results$brazil_state]<-NA + results$brazil_city <- trimws(results$brazil_city, which = "both") + results$brazil_state <- trimws(results$brazil_state, which = "both") + # Define words indicating this is actually a dept not state or postal code + # will use this list to delete the ones that don't apply + to_check <- c("dept","ctr","inst","ppg", + "programa", "ciencias", "unidade", "lab ") + + results$brazil_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brazil_city), + results$brazil_state, + results$brazil_city) + + # Clean postal codes + results$brazil_pc <- gsub("[A-Za-z-]", "", results$brazil_pc) + + # Final trimming + results[] <- lapply(results, trimws) + + + # Update `city_list` if necessary + city_list <- ifelse(is.na(city_list), results$brazil_city, city_list) + # Update `state_list` if necessary + state_list<-ifelse(is.na(state_list),results$brazil_state, state_list) + # Update `pc_list` if necessary + pc_list<-ifelse(is.na(pc_list),results$brazil_pc, pc_list) + + + rm(results,city_state_mapping) + + +# CHINA ------------------------------------------------------------------- + chn_extract <- function(x) { + if (length(x) == 1) { + return(c(NA, NA)) + } else if (x[[length(x)]] == "china") { + return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 2]]))) + } else { + return(c(NA, NA)) + } + } + + chn_pc <- data.frame(do.call(rbind, lapply(list_address, chn_extract))) + names(chn_pc) <- c("chn_city", "chn_state") + + # Define the function + extract_china_postcodes <- function(df, source_col, target_col,chn_new_city) { + # 6 digits + pattern <- "[0-9]{6}" + + # Loop through each row of the dataframe + for(i in 1:nrow(df)) { + # Find all matches of the pattern in the source column + matches <- gregexpr(pattern, df[i, source_col]) + # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) + # Extract the matches + extracted_codes <- regmatches(df[i, source_col], matches) + # If there's at least one match and the target column is NA, copy the first match to the target column + if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { + df[i, target_col] <- extracted_codes[[1]][1] + df[i, chn_new_city] <- df[i, source_col] + # df[i, city_col] <- df[i, source_col] + + } + } + return(df) + } + + chn_pc$chn_pc<-NA + chn_pc$chn_new_city<-NA + # df, source_col, target_col,city_col + chn_pc <- extract_china_postcodes(chn_pc, "chn_city","chn_pc", "chn_new_city") + chn_pc <- extract_china_postcodes(chn_pc, "chn_state","chn_pc", "chn_new_city") + + + + # any without numbers gets NA'd + chn_pc$chn_pc[!grepl("\\d", chn_pc$chn_pc)] <- NA + + # keep portions with numbers / remove city names + chn_pc$chn_new_city<-sub("[0-9]{6}","",chn_pc$chn_new_city) + chn_pc$chn_city<-sub("[0-9]{6}","",chn_pc$chn_city) + chn_pc$chn_state<-sub("[0-9]{6}","",chn_pc$chn_state) + + + # Define words indicating this is actually a dept not state or postal code + # will use this list to delete the ones that don't apply + to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", + "dept", "div", "univ", "hosp", "coll", "sci", "rd", + "program","minist", "educ", "sch ", "grad ", "fac ", + "assoc","forest") + + + + chn_pc[, c("chn_city","chn_state", "chn_pc", "chn_new_city")]<-lapply(chn_pc[, c("chn_city","chn_state", "chn_pc", "chn_new_city")], trimws) + + + clean_column <- function(column, delete_terms) { + column <- gsub(paste(delete_terms, collapse = "|"), NA, column) + column <- gsub("[0-9]", "", column) # Remove digits + trimws(column) # Remove leading/trailing whitespace + } + + + # Clean chn_pc1 and chn_pc2 + chn_pc$chn_city <- clean_column(chn_pc$chn_city, to_delete) + chn_pc$chn_new_city <- clean_column(chn_pc$chn_new_city, to_delete) + chn_pc$chn_state <- clean_column(chn_pc$chn_state, to_delete) + + + chn_pc$chn_new_city <- ifelse(is.na(chn_pc$chn_new_city), + chn_pc$chn_state, + chn_pc$chn_new_city) + + + chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_state)==TRUE), + NA,chn_pc$chn_state) + + + + chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_city)==FALSE), + chn_pc$chn_city,chn_pc$chn_state) + + + + chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_state)==TRUE), + NA,chn_pc$chn_state) + + chn_pc$chn_state <- gsub(" province", "",chn_pc$chn_state) + + # Update `city_list` if necessary + city_list <- ifelse(is.na(city_list), chn_pc$chn_new_city, city_list) + + # Update `state_list` if necessary + state_list<-ifelse(is.na(state_list),chn_pc$chn_state, state_list) + + # Update `pc_list` if necessary + pc_list<-ifelse(is.na(pc_list),chn_pc$chn_pc, pc_list) + + rm(chn_pc) +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# ### China has some where the postal code is with the city, so fix those here +# # Extract postal code information from the list +# chn_extract <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) +# } else if (x[[length(x)]] == "china") { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 2]]))) +# } else { +# return(c(NA, NA)) +# } +# } + +# Apply extraction to list_address +# chn_missing_pc <- data.frame(do.call(rbind, lapply(list_address, chn_extract))) +# names(chn_missing_pc) <- c("chn_pc1", "chn_pc2") +# +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", +# "dept", "div", "univ", "hosp", "coll", "sci", "rd","program" +# "minist", "educ", "sch ", "grad ", "fac ","assoc") +# +# +# +# # Extract numeric postal codes +# chn_missing_pc$chn_pc_from1 <- as.numeric(gsub("[A-Za-z]", "", chn_missing_pc$chn_pc1)) +# chn_missing_pc$chn_pc_from2 <- as.numeric(gsub("[A-Za-z]", "", chn_missing_pc$chn_pc2)) +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# clean_column <- function(column, delete_terms) { +# column <- gsub(paste(delete_terms, collapse = "|"), NA, column) +# column <- gsub("[0-9]", "", column) # Remove digits +# trimws(column) # Remove leading/trailing whitespace +# } +# +# # Clean chn_pc1 and chn_pc2 +# chn_missing_pc$chn_pc1 <- clean_column(chn_missing_pc$chn_pc1, to_delete) +# chn_missing_pc$chn_pc2 <- clean_column(chn_missing_pc$chn_pc2, to_delete) +# +# # Initialize empty columns for final outputs +# chn_missing_pc$chn_pc <- NA +# chn_missing_pc$chn_city <- NA +# chn_missing_pc$chn_state <- NA +# +# # Assign postal codes, cities, and states based on conditions +# assign_chn_data <- function(from1, from2, pc1, pc2) { +# list( +# chn_pc = ifelse(is.na(from1) & !is.na(from2), from2, from1), +# chn_city = ifelse(is.na(from1) & !is.na(from2), pc2, pc1), +# chn_state = ifelse(is.na(from1) & !is.na(from2), pc1, pc2) +# ) +# } +# +# chn_result <- assign_chn_data(chn_missing_pc$chn_pc_from1, +# chn_missing_pc$chn_pc_from2, +# chn_missing_pc$chn_pc1, +# chn_missing_pc$chn_pc2) +# +# chn_missing_pc$chn_pc <- chn_result$chn_pc +# chn_missing_pc$chn_city <- gsub("[0-9]", "", chn_result$chn_city) +# chn_missing_pc$chn_state <- gsub("[0-9]", "", chn_result$chn_state) + +# # Define Chinese states and cities +# chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", +# "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", +# "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", +# "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", +# "gansu", "inner mongolia", "jilin", "hainan", "ningxia", +# "qinghai", "tibet", "macao") +# +# # All the cities in the addresses, add as needed. +# chn_cities <- unique(c(chn_missing_pc$chn_city, "lhasa")) + +# Update states and cities based on matching conditions +# chn_missing_pc$chn_state <- ifelse(is.na(chn_missing_pc$chn_state) & chn_missing_pc$chn_pc1 %in% chn_states, +# chn_missing_pc$chn_pc1, chn_missing_pc$chn_state) +# chn_missing_pc$chn_state <- ifelse(is.na(chn_missing_pc$chn_state) & chn_missing_pc$chn_pc2 %in% chn_states, +# chn_missing_pc$chn_pc2, chn_missing_pc$chn_state) +# +# chn_missing_pc$chn_city <- ifelse(is.na(chn_missing_pc$chn_city) & !(chn_missing_pc$chn_pc1 %in% chn_states), +# chn_missing_pc$chn_pc1, chn_missing_pc$chn_city) +# chn_missing_pc$chn_city <- ifelse(is.na(chn_missing_pc$chn_city) & !(chn_missing_pc$chn_pc2 %in% chn_states), +# chn_missing_pc$chn_pc2, chn_missing_pc$chn_city) +# +# # put the postal codes and cities in the pc_list, state_list +# pc_list<-ifelse(is.na(pc_list),chn_missing_pc$chn_pc, pc_list) +# city_list<-ifelse(is.na(city_list),chn_missing_pc$chn_city, city_list) +# state_list<-ifelse(is.na(state_list),chn_missing_pc$chn_state, state_list) +# +# +# rm(chn_cities,chn_states,to_delete,chn_result,chn_missing_pc) + + + + + + + + + +# UK ------------------------------------------------------------------ + +process_uk_address <- function(x) { + if (length(x) == 1) { + return(c(NA, NA)) # Not uk + } + + if ((x[[length(x)]] == "england")| + (x[[length(x)]] == "scotland")| + (x[[length(x)]] == "wales")| + (x[[length(x)]] == "northern ireland")) { + if (length(x) == 3) { + return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) + } else { + return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) + } + } + + return(c(NA, NA)) # Not uk +} + +# Apply the function across all addresses using `mapply` +results <- t(mapply(process_uk_address, list_address)) +colnames(results) <- c("uk_city", "uk_state") + + + + +# Clean up results +results <- as.data.frame(results, stringsAsFactors = FALSE) +results$uk_city[results$uk_city == "not uk"] <- NA +results$uk_state[results$uk_state == "not uk"] <- NA + + + +results$uk_pc<-NA + +# Define the function +extract_uk_postcodes <- function(df, source_col, target_col,city_col) { + # Regular expression pattern for UK postal codes + # One or two initial letters. + # One or two digits (and possibly a letter). + # A mandatory space. + # One digit followed by two letters. + pattern <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" + pattern2 <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [A-Za-z]{2}" + pattern3 <- "[A-Za-z]{1,2}[0-9R]{3} [A-Za-z]{3}" + + + # Loop through each row of the dataframe + for(i in 1:nrow(df)) { + # Find all matches of the pattern in the source column + matches <- gregexpr(pattern, df[i, source_col]) + # Extract the matches + extracted_codes <- regmatches(df[i, source_col], matches) + # If there's at least one match and the target column is NA, copy the first match to the target column + if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { + df[i, target_col] <- extracted_codes[[1]][1] + df[i, city_col] <- df[i, source_col] + + } + } + return(df) +} + +# Example usage + +results <- extract_uk_postcodes(results, "uk_city","uk_pc", "uk_city") +results <- extract_uk_postcodes(results, "uk_state","uk_pc", "uk_city") + +# any without numbers gets NA'd +results$uk_pc[!grepl("\\d", results$uk_pc)] <- NA + + + +results$new_city<-NA + + +# Define the function +uk_city_id <- function(df, source_col, target_col) { + # Regular expression pattern for UK postal codes + # One or two initial letters. + # One or two digits (and possibly a letter). + # A mandatory space. + # One digit followed by two letters. + pattern <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" + + # Loop through each row of the dataframe + for(i in 1:nrow(df)) { + # Find all matches of the pattern in the source column + matches <- gregexpr(pattern, df[i, source_col]) + # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) + # Extract the matches + extracted_codes <- regmatches(df[i, source_col], matches) + # If there's at least one match and the target column is NA, copy the first match to the target column + if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { + df[i, target_col] <- df[i, source_col] + } + } + return(df) +} + +# Example usage + +results <- uk_city_id(results, "uk_city","new_city") +results <- uk_city_id(results, "uk_state","new_city") + + +results$new_city<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$new_city) + + + + +# Define the function +uk_city_id <- function(df, source_col, target_col) { + # Regular expression pattern for UK postal codes + # One or two initial letters. + # One or two digits (and possibly a letter). + # A mandatory space. + # One digit followed by two letters. + pattern <- "\\s[A-Za-z0-9]{2,4}\\s[A-Za-z0-9]{2,4}" + + # Loop through each row of the dataframe + for(i in 1:nrow(df)) { + # Find all matches of the pattern in the source column + matches <- gregexpr(pattern, df[i, source_col]) + # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) + # Extract the matches + extracted_codes <- regmatches(df[i, source_col], matches) + # If there's at least one match and the target column is NA, copy the first match to the target column + if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { + df[i, target_col] <- df[i, source_col] + } + } + return(df) +} + +# Example usage +results <- uk_city_id(results, "uk_state","new_city") + + + +results$uk_state<-ifelse(results$uk_state==results$uk_city, + "",results$uk_state) + +results$new_city<-ifelse(is.na(results$new_city), + results$uk_city, + results$new_city) +# remove zip codes from new city +results$new_city<-gsub("\\s[A-Za-z0-9]{3}\\s[A-Za-z0-9]{3}","",results$new_city) +results$new_city<-gsub("\\s[A-Za-z0-9]{4}\\s[A-Za-z0-9]{2,3}","",results$new_city) + + +results$new_city<-ifelse(results$uk_state=="london", + "london", + results$new_city) + + +results$uk_state<-ifelse(results$uk_state=="london", + NA, + results$uk_state) + + + + + + + + + + +# keep portions with numbers / remove city names +# results$uk_city<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$uk_city) +# results$uk_state<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$uk_state) +# results$uk_pc<-sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", results$uk_pc) + +# results$can_pc<-gsub("\\b[^\\d\\s]+\\b", "", results$can_pc) #+[a-z] removes 2nd ltr + +results$uk_pc[results$uk_pc == ""] <- NA +# now remove any PC from city + +# Update `city_list` if necessary +city_list <- ifelse(is.na(city_list), results$new_city, city_list) + +# Update `state_list` if necessary +state_list<-ifelse(is.na(state_list),results$uk_state, state_list) + +# Update `pc_list` if necessary +pc_list<-ifelse(is.na(pc_list),results$uk_pc, pc_list) + +rm(results) + + + + + + + + + + + + + + + + + + + + +# Extracts postal code when combined with city name ----------------------- + +city_list <- trimws(city_list, which = "both") + + + + city_clean<-data.frame( + authorID=ID, + addresses=addresses, + original_city=city_list, + city_list=city_list, + state_list=state_list, + country_list=country_list, + extract_pc=pc_list) + + + # # England, Scotland, Wales --------------------------------------------- + # + # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc)& (city_clean$country_list=="england"| + # city_clean$country_list=="wales" | + # city_clean$country_list=="scotland"), + # gsub(".*\\s([a-z0-9]+\\s[a-z0-9]+)$", "\\1", city_clean$city_list), + # city_clean$extract_pc) + # + # # This then deletes the postal code from the city name + # city_clean$city_list<-ifelse((city_clean$country_list=="england"| + # city_clean$country_list=="wales" | + # city_clean$country_list=="scotland"), + # (gsub("([A-Za-z0-9]+\\s[A-Za-z0-9]+)$", "", city_clean$city_list)), + # city_clean$city_list) + # + city_clean[city_clean == ""] <- NA + + # Define the function + extract_postcodes <- function(df, source_col, target_col) { + # One or two initial letters. + # mandatory dash + # several numbers + pattern <- "\\b[A-Za-z]{1,2}[-][0-9]{4,8}\\b" + # Loop through each row of the dataframe + for(i in 1:nrow(df)) { + # Find all matches of the pattern in the source column + matches <- gregexpr(pattern, df[i, source_col]) + # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) + # Extract the matches + extracted_codes <- regmatches(df[i, source_col], matches) + # If there's at least one match and the target column is NA, copy the first match to the target column + if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { + df[i, target_col] <- extracted_codes[[1]][1] + } + } + return(df) + } + + # Example usage + + city_clean <- extract_postcodes(city_clean, "city_list","extract_pc") + + + + # Define the function + extract_postcodes <- function(df, source_col, target_col) { + # One or two initial letters. + # mandatory dash + # several numbers + pattern <- " [0-9]{3,9}" + # Loop through each row of the dataframe + for(i in 1:nrow(df)) { + # Find all matches of the pattern in the source column + matches <- gregexpr(pattern, df[i, source_col]) + # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) + # Extract the matches + extracted_codes <- regmatches(df[i, source_col], matches) + # If there's at least one match and the target column is NA, copy the first match to the target column + if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { + df[i, target_col] <- extracted_codes[[1]][1] + } + } + return(df) + } + + + # Example usage + + city_clean <- extract_postcodes(city_clean, "city_list","extract_pc") + + + + # Define the function + delete_matching_text <- function(df, col_a, col_b) { + # Loop through each row of the dataframe + for(i in 1:nrow(df)) { + # Check if the value in column A is not NA and is found within the text in column B + if(!is.na(df[i, col_a]) && grepl(df[i, col_a], df[i, col_b])) { + # Remove the matching text from column B by replacing it with an empty string + df[i, col_b] <- gsub(df[i, col_a], "", df[i, col_b]) + } + } + return(df) + } + + city_clean <- delete_matching_text(city_clean, "extract_pc","city_list") + + city_clean <- delete_matching_text(city_clean, "extract_pc","state_list") + + + # remove state if same as city + + city_clean$state_list<-ifelse(city_clean$state_list==city_clean$city_list,NA, city_clean$state_list) + + + # there are some usa ones that had state zip buyt no country + + + # Define the function + extract_postcodes <- function(df, country_list, extract_pc, state_list) { + # One or two initial letters. + # mandatory dash + # several numbers + pattern <- "\\b[A-Za-z]{2} [0-9]{5}\\b" + # Loop through each row of the dataframe + for(i in 1:nrow(df)) { + # Find all matches of the pattern in the source column + matches <- gregexpr(pattern, df[i, country_list]) + # Extract the matches + extracted_codes <- regmatches(df[i, country_list], matches) + # If there's at least one match and the target column is NA, copy the first match to the target column + if(length(extracted_codes[[1]]) > 0 && is.na(df[i, extract_pc])) { + df[i, extract_pc] <- extracted_codes[[1]][1] + df[i, state_list] <- extracted_codes[[1]][1] + df[i, country_list] <- "usa" + } + } + return(df) + } + + + # Example usage + + city_clean <- extract_postcodes(city_clean, + "country_list", "extract_pc", "state_list") + + + # remove zip codes from states + city_clean$state_list<-ifelse(city_clean$country_list=="usa", + gsub("[0-9]","",city_clean$state_list), + city_clean$state_list) + + # remove state from zipcode + city_clean$extract_pc<-ifelse(city_clean$country_list=="usa", + gsub("[a-z]","",city_clean$extract_pc), + city_clean$extract_pc) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + city_clean$extract_pc <- trimws(gsub("&", "", city_clean$extract_pc)) + city_clean[city_clean == ""] <- NA + +# +# +# +# +# +head(city_clean) +# +# # Cleaning up city names with zip codes in them +# # take the zip code and put it in a new column before deleting +# +# +# +# +# # 2) Countries with postal code AFTER city -------------------------------- +# +# +# city_zip<-c("ireland","qatar","kazakhstan","peru","turkey","south korea", +# "japan","costa rica","mexico","new zealand","iran","thailand", +# "russia","spain","india","singapore","indonesia","chile", +# "finland","colombia","taiwan","saudi arabia","uruguay", +# "slovenia","spain") +# +# +# +# city_clean$extract_pc<-ifelse((is.na(city_clean$extract_pc)& (city_clean$country_list %in% city_zip)), +# (gsub(".*[A-Za-z]+", "", city_clean$city_list)), +# city_clean$extract_pc) +# # +# city_clean$city_list<-ifelse((city_clean$country_list %in% city_zip), +# gsub("\\s([0-9]+)", "", city_clean$city_list), +# city_clean$city_list) +# +# city_clean[city_clean == ""] <- NA +# +# +# # 3) Postal code and dash BEFORE city name -------------------------------- +# +# +# zip_dash<-c("finland","slovakia","austria","portugal","belgium", +# "spain","israel","czech republic","argentina","france", +# "sweden","switzerland","turkey","germany","italy", +# "lithuania","hungary","denmark","poland","norway", "iceland", +# "greece", "ukraine","estonia","latvia","luxembourg","lativa", +# "south africa","bulgaria","brazil") +# +# +# +# +# city_clean$extract_pc <- ifelse((is.na(city_clean$extract_pc) & +# (city_clean$country_list %in% zip_dash)), +# # gsub("\\s([A-Za-z]+)", "", city_clean$city_list), +# sub(" .*", "", city_clean$city_list), +# city_clean$extract_pc) +# +# +# city_clean$city_list<-ifelse((city_clean$country_list %in% zip_dash), +# gsub("[a-zA-Z]+-[0-9]+", "", city_clean$city_list), +# city_clean$city_list) +# +# +# city_clean[city_clean == ""] <- NA +# +# # 4) Netherlands Postal Code ---------------------------------------------- +# +# # Netherlands has Postal Code before +# # it is a combination of 2-3 blocks of letters and numbers +# city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc) & (city_clean$country_list=="netherlands"), +# (gsub("^(([^ ]+ ){2}).*", "\\1", city_clean$city_list)), +# city_clean$extract_pc) +# city_clean$city_list<-ifelse((city_clean$country_list=="netherlands"), +# (gsub(".*\\s", "", city_clean$city_list)), +# city_clean$city_list) +# city_clean[city_clean == ""] <- NA +# +# # 5) Venezuela ----------------------------------------------------------- +# +# # Venezuela has postal code after, it is combo of letters and numbers +# city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc) & (city_clean$country_list=="venezuela"), +# gsub(".*\\s([a-z0-9]+)$", "\\1", city_clean$city_list), +# city_clean$extract_pc) +# +# # This then deletes the postal code from the city name +# city_clean$city_list<-ifelse(city_clean$country_list=="venezuela", +# (gsub("(\\s[A-Za-z0-9]+)$", "", city_clean$city_list)), +# city_clean$city_list) +# city_clean[city_clean == ""] <- NA +# +# +# +# # trim ws +# city_clean$extract_pc <- trimws(city_clean$extract_pc, which = "both") +# city_clean$city_list <- trimws(city_clean$city_list, which = "both") +# # This removes any that don't have numbers in them +# city_clean$extract_pc[!grepl("[0-9]", city_clean$extract_pc)] <- NA +# city_clean$city_list <- gsub("[0-9]+", "", city_clean$city_list) +# +# +# Final Clean Up ---------------------------------------------------------- + + # Russia + city_clean$city_list<-ifelse(city_clean$country_list=="russia", + (gsub("st Petersburg p", "st petersburg", city_clean$city_list)), + city_clean$city_list) + + + + # India + India_delete<- c("dept") + + city_clean$city_list <- ifelse(city_clean$country_list=="india", + gsub(paste(India_delete, collapse = "|"), NA, city_clean$city_list), + city_clean$city_list) + city_clean$city_list<-trimws(city_clean$city_list) # Remove leading/trailing whitespace + + # brazil + city_clean$extract_pc<-ifelse((city_clean$country_list=="brazil" & nchar(city_clean$extract_pc) < 5), + "",city_clean$extract_pc) + city_clean[city_clean == ""] <- NA + + brazil_delete<- c("barretos canc hosp","univ fed sao paulo", + "escola filosofia letras & ciencias humanas", + "hosp sirio libanes","perola byington hosp", + "sp","univ fed sao paulo","nacl","pesquisa", "museu","dept", + "lab","zoologia", "inst", "programa","ppg", "ppg") + + + city_clean$city_list <- ifelse(city_clean$country_list=="brazil", + gsub(paste(brazil_delete, collapse = "|"), NA, city_clean$city_list), + city_clean$city_list) + + city_clean$city_list<-trimws(city_clean$city_list, which="both") # Remove leading/trailing whitespace + city_clean$state_list<-trimws(city_clean$state_list, which="both") # Remove leading/trailing whitespace + city_clean$country_list<-trimws(city_clean$country_list, which="both") # Remove leading/trailing whitespace) # Remove leading/trailing whitespace + # City Abbreviations + + city_clean$city_list<-ifelse(city_clean$city_list=="university pk", + "university park", + city_clean$city_list) + + + city_clean$city_list<-ifelse(city_clean$city_list=="n chicago", + "north chicago", + city_clean$city_list) + + city_clean$city_list<-ifelse(city_clean$city_list=="college pk", + "college park", + city_clean$city_list) + + city_clean$city_list<-ifelse(city_clean$city_list=="research triangle pk", + "research triangle park", + city_clean$city_list) + + city_clean$city_list<-ifelse(city_clean$city_list=="state coll", + "state college", + city_clean$city_list) + + # city corrections + city_clean$city_list<-ifelse((city_clean$city_list=="dehra dun" & city_clean$country_list == "india"), + "dehradun", + city_clean$city_list) + + city_clean$city_list<-ifelse((city_clean$city_list=="st john" & city_clean$country_list == "canada"), + "st. john's", + city_clean$city_list) + + city_clean$state_list<-ifelse(city_clean$state_list=="london ", + "london", + city_clean$state_list) + + city_clean$city_list<-ifelse((city_clean$state_list=="london" & city_clean$country_list == "england"), + "london", + city_clean$city_list) + + + city_clean$state_list<-ifelse(city_clean$state_list=="london", + NA, + city_clean$state_list) + + + + city_list<-city_clean$city_list + state_list<-city_clean$state_list + pc_list<-city_clean$extract_pc + country_list<-city_clean$country_list + + + rm(brazil_delete,India_delete) + + + # rm(city_clean) + + + pc_list[pc_list == ""] <- NA + city_list[city_list == ""] <- NA + state_list[state_list == ""] <- NA + dept_list[dept_list == ""] <- NA + country_list[country_list == ""] <- NA + # Create the df that will be returned + cleaned_ad<-data.frame(ID, + addresses, + university_list, + dept_list, + city_list, + country_list, + state_list, + pc_list) + + + + # names(cleaned_ad) + + + list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))]) + + # Because formats of address printing is different across platforms + # We are going to split using a tier system assuming first and last + # info is somewhat reliable and guess the other info from the + # remaining position of the info + + second_tier_list <- lapply(list_address1, function(x) x[length(x)]) + second_tier_list <- trimws(second_tier_list, which = "both") + second_tier_list[second_tier_list == "character(0)"] <- NA + + list_address2 <- lapply(list_address1, function(x) x[-c(length(x))]) + + third_tier_list <- lapply(list_address2, function(x) x[length(x)]) + third_tier_list <- trimws(third_tier_list, which = "both") + third_tier_list[third_tier_list == "character(0)"] <- NA + + # All remaining info is just shoved in this category + remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1]) + remain_list <- trimws(remain_list, which = "both") + remain_list[remain_list == "character(0)"] <- NA + + + # original + # a_df <- data.frame( + # adID = ID, university = university_list, + # country = country_list, + # state = state_list, postal_code = pc_list, city = NA, + # department = NA, second_tier = second_tier_list, + # third_tier = third_tier_list, + # remain = remain_list, address = addresses, + # stringsAsFactors = FALSE + # ) + + # EB EDIT + a_df_1 <- data.frame( + adID = ID, + university_1 = university_list, + university = university_list, + country_1 = country_list, + country = country_list, + state_1 = state_list, + state = state_list, + postal_code_1 = pc_list, + postal_code = pc_list, + city_1 = city_list, + city = city_list, + department_1 = dept_list, + department = dept_list, + second_tier = second_tier_list, + third_tier = third_tier_list, + remain = remain_list, + address = addresses, + stringsAsFactors = FALSE + ) + + a_df_1$city_list<-ifelse(is.na(a_df_1$city_1), + a_df_1$city, + a_df_1$city_1) + + a_df_1$postal_code_1<-ifelse(is.na(a_df_1$postal_code_1), + a_df_1$postal_code, + a_df_1$postal_code_1) + + + # Quebec Postal Code is PQ (Fr) or QC (En) but only QC is georeferenced + a_df_1$state_1<-ifelse(a_df_1$state_1=="pq" & a_df_1$country_1 == "canada", + "qc", + a_df_1$state_1) + + + + + + + + + + + + + + + + + + +a_df<-a_df_1 + +rm(a_df_1) + + + + # try to fix the usa spots, which vary in format than other countries + a_df$state<-ifelse(is.na(a_df$state),"",a_df$state) # added by eb to deal with index problem + a_df$city[nchar(a_df$state) > 0] <- a_df$second_tier[nchar(a_df$state) > 0] + a_df$state[nchar(a_df$state) == 0] <- NA + a_df$postal_code[nchar(a_df$postal_code) == 0] <- NA + a_df$department[!is.na(a_df$state) & !is.na(a_df$postal_code) & + !is.na(a_df$state)] <- a_df$third_tier[!is.na(a_df$state) & + !is.na(a_df$postal_code) & !is.na(a_df$state)] + # fix a US problem when usa is not tacked onto the end + + us_reg <- "[[:alpha:]]{2}[[:space:]]{1}[[:digit:]]{5}" + a_df$state[ grepl(us_reg, a_df$country) ] <- + substr(a_df$country[ grepl(us_reg, a_df$country) ], 1, 2) + + a_df$postal_code[ grepl(us_reg, a_df$country) ] <- + substr(a_df$country[grepl(us_reg, a_df$country)], 4, 8) + + a_df$country[grepl(us_reg, a_df$country)] <- "usa" + + + a_df$state_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", + a_df$state, + a_df$state_1) + + a_df$postal_code_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", + a_df$postal_code, + a_df$postal_code_1) + + + a_df$country_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", + a_df$country, + a_df$country_1) + + + ########################## + # We'll use regular expression to pull zipcodes + # These formats differ by region + int1 <- "[[:alpha:]]{2}[[:punct:]]{1}[[:digit:]]{1,8}" + int2 <- paste("[[:space:]][[:upper:]][[:digit:]][[:upper:]]", + "[[:space:]][[:digit:]][[:upper:]][[:digit:]]", sep="") + int3 <- "[[:alpha:]][[:punct:]][[:digit:]]{4,7}" + int4 <- "[:upper:]{1,2}[:alnum:]{1,3}[:space:][:digit:][:alnum:]{1,3}" + int <- paste(int1, int2, int3, int4, sep = "|") + + uk <- paste("[[:upper:]]{1,2}[[:digit:]]{1,2}[[:space:]]", + "{1}[[:digit:]]{1}[[:upper:]]{2}", sep="") + + mexico <- "[[:space:]]{1}[[:digit:]]{5}" # technically US as well + + panama <- "[[:digit:]]{4}-[[:digit:]]{5}" + + zip_search <- paste0(int, "|", uk, "|", mexico, "|", panama) + + + + + + # ADDRD EB INSTEAD OF + a_df$city_1 <- ifelse(is.na(a_df$city_1),a_df$third_tier,a_df$city_1) + a_df$state_1 <- ifelse(is.na(a_df$state_1),a_df$second_tier,a_df$state_1) + a_df$postal_code_1 <- ifelse(is.na(a_df$postal_code_1),a_df$second_tier,a_df$postal_code_1) + + + # fix country - usa + # Function to remove everything before " usa" + remove_before_usa <- function(x) { + if (grepl(" usa", x)) { + return(sub(".*(?= usa)", "", x, perl = TRUE)) + } else { + return(x) + } + } + + # Apply the function to each element in the vector + a_df$country_1 <- sapply(a_df$country_1, remove_before_usa) + a_df$country_1 <- trimws(a_df$country_1, which = "both") + + + a_df$state_1 <- ifelse(a_df$country_1=="usa", + (trimws(gsub("[0-9]", "", a_df$state_1), which="both")), + a_df$state_1) + + a_df$postal_code_1 <- ifelse(a_df$country_1=="usa", + (trimws(gsub("[a-z]", "", a_df$postal_code_1), which="both")), + a_df$postal_code_1) + + + + ########################### + id_run <- a_df$adID[is.na(a_df$state) & is.na(a_df$postal_code) & + a_df$address != "Could not be extracted"] + ########################### + + # We now iteratively run through the addresses using the concept that + # certain information always exists next to each other. + # Ex. city, state, country tend to exist next to each other. + # We use the position of the zipcode also to help guide us + # in where the information lies as well as how many fields were + # given to us. + for (i in id_run) { + found <- FALSE + row <- which(a_df$adID == i) + university <- a_df$university[row] + second_tier <- a_df$second_tier[row] + third_tier <- a_df$third_tier[row] + remain <- a_df$remain[row] + city <- a_df$city[row] + state <- a_df$state[row] + postal_code <- a_df$postal_code[row] + department <- a_df$department[row] + grepl(zip_search, second_tier) + grepl(zip_search, third_tier) + # 2nd tier + if (grepl(zip_search, second_tier)) { + found <- TRUE + postal_code <- regmatches(second_tier, regexpr(zip_search, second_tier)) + city <- gsub(zip_search, "", second_tier) + department <- ifelse(is.na(remain), third_tier, remain) + } + # 3RD tiers + if (grepl(zip_search, third_tier) & !found) { + found <- TRUE + postal_code <- regmatches(third_tier, regexpr(zip_search, third_tier)) + city <- gsub(zip_search, "", third_tier) + state <- second_tier + department <- remain + } + + if (!found) { + state <- second_tier + city <- third_tier + department <- remain + } + # To make university searching more efficient we'll override values + # based on if it has university/college in the name, + # where university overides college + override_univ <- grepl("\\buniv\\b|\\buniversi", + c(second_tier, third_tier, remain, city, university), + ignore.case = TRUE) & + !grepl("\\bdrv\\b|\\bdrive\\b", + c(second_tier, third_tier, remain, city, university), + ignore.case = TRUE) + + if (any(override_univ)) { + university <- + c(second_tier, third_tier, remain, city, university)[override_univ][1] + assign( + c("second_tier", "third_tier", "remain", "city", "university")[ + override_univ][1], + NA + ) + } + # only if university doesnt already exist + override_univ_col <- + grepl("\\bcol\\b|college|\\bcoll\\b", + c(second_tier, third_tier, remain, city, university), + ignore.case = TRUE) & + !grepl("\\bdrv\\b|\\bdrive\\b", + c(second_tier, third_tier, remain, city, university), + ignore.case = TRUE) + + if (!any(override_univ) & any(override_univ_col)) { + university <- + c(second_tier, third_tier, remain, city, university )[ + override_univ_col][1] + + assign( + c("second_tier", "third_tier", "remain", "city", "university")[ + override_univ_col][1], + NA + ) + } + # more risky, but institutions as well, just incase its not a university + override_univ_inst <- grepl("\\binst\\b|\\binstitut", + c(second_tier, third_tier, remain, city, university), + ignore.case = TRUE) + if ( + !any(override_univ) & !any(override_univ_col) & any(override_univ_inst) + ) { + department <- c(second_tier, third_tier, remain, city, university )[ + override_univ_inst][1] + + assign( + c("second_tier", "third_tier", "remain", "city", "university")[ + override_univ_inst][1], + NA + ) + } + + a_df$city[row] <- gsub("[[:digit:]]", "", city) + a_df$state[row] <- gsub("[[:digit:]]", "", state) + a_df$postal_code[row] <- postal_code + a_df$department[row] <- department + + + + #########################Clock############################### + total <- length(id_run) + pb <- utils::txtProgressBar(min = 0, max = total, style = 3) + utils::setTxtProgressBar(pb, which(id_run == i)) + ############################################################# + } + + + city_fix <- is.na(a_df$city) & !is.na(a_df$state) + a_df$city[city_fix] <- a_df$state[city_fix] + a_df$state[city_fix] <- NA + a_df$university[a_df$university == "Could not be extracted"] <- NA + a_df$country[a_df$country == "Could not be extracted"] <- NA + # a_df$country[a_df$country == "peoples r china"] <- "China" + # a_df$country[a_df$country == "U Arab Emirates"] <- "United Arab Emirates" + # a_df$country[a_df$country == "Mongol Peo Rep"] <- "Mongolia" + + a_df$postal_code[grepl("[[:alpha:]]{1,2}-", a_df$postal_code)] <- + vapply(strsplit( + a_df$postal_code[ grepl("[[:alpha:]]{1,2}-", a_df$postal_code)], + "-"), + function(x) x[2], character(1) + ) + #strip periods from the ends of city,state,country + a_df$city <- gsub("\\.", "", a_df$city) + a_df$state <- gsub("\\.", "", a_df$state) + a_df$country <- gsub("\\.", "", a_df$country) + a_df$country[a_df$country == ""] <- NA + a_df$university[a_df$university == ""] <- NA + a_df$postal_code[a_df$postal_code == ""] <- NA + #convert to lower + for (l in 2:ncol(a_df)){ + a_df[, l] <- tolower(a_df[, l]) + } + + # Select columns + a_df <- a_df[, c("adID", + "university_1", + "country_1", + "state_1", + "postal_code_1", + "city_1", + "department_1", + "second_tier", + "third_tier", + "remain", + "address") + ] + + # Rename columns + colnames(a_df) <- c("adID", + "university", + "country", + "state", + "postal_code", + "city", + "department", + "second_tier", + "third_tier", + "remain", + "address") + + + # sometimes the postal code fails to prse out of state. canm use this + # when postal code is missing, but then need to remove + # Function to extract numbers from one column and copy them to another column + extract_numbers <- function(df, source_col, target_col) { + if (is.na(target_col)) { + + + df[[target_col]] <- gsub(".*?(\\d+).*", "\\1", df[[source_col]]) + df[[target_col]][!grepl("\\d", df[[source_col]])] <- NA + return(df) + + } else { + return(df) + } + + } + + # Apply the function to the dataframe + a_df <- extract_numbers(a_df, "state", "postal_code") + + + + # ther postal code and city are sometimes in tier3 + a_df$city <- ifelse(is.na(a_df$city), a_df$third_tier, a_df$city) + a_df$postal_code <- ifelse(is.na(a_df$postal_code), a_df$third_tier, a_df$postal_code) + + + + + + # Function to remove matching characters from col1 based on col2 + remove_matching <- function(col1, col2) { + pattern <- paste0("\\b", gsub("([\\W])", "\\\\\\1", col2), "\\b") + result <- sub(pattern, "", col1) + trimws(result) + } + + # Apply the function to each row + a_df$city <- mapply(remove_matching, a_df$city, a_df$postal_code) + a_df$state <- mapply(remove_matching, a_df$state, a_df$postal_code) + + + + library(tidyverse) + + + country<- a_df %>% + # mutate(city_match=(city==second_tier)) %>% + # filter(city_match==FALSE) %>% + distinct(country) %>% + mutate(summary=nchar(country)) %>% + arrange(country) + + + country_city<- a_df %>% + # mutate(city_match=(city==second_tier)) %>% + # filter(city_match==FALSE) %>% + distinct(country,city) %>% + mutate(city_char=nchar(city)) %>% + arrange(country,city) + + + + country_state<- a_df %>% + # mutate(city_match=(city==second_tier)) %>% + # filter(city_match==FALSE) %>% + distinct(country,state) %>% + mutate(city_char=nchar(state)) %>% + arrange(country,state) + + country_state_city<- a_df %>% + # mutate(city_match=(city==second_tier)) %>% + # filter(city_match==FALSE) %>% + distinct(country ,state,city) %>% + mutate(city_char=nchar(city)) %>% + arrange(country,state,city) + + + country_state_city_pc<- a_df %>% + # mutate(city_match=(city==second_tier)) %>% + # filter(city_match==FALSE) %>% + distinct(country ,state,postal_code,city) %>% + mutate(city_char=nchar(city)) %>% + arrange(country,state,postal_code,city) + + return(a_df) +} From 15a8b2f4cba1ac1bfaf09274fc0b1c4f6c4536c8 Mon Sep 17 00:00:00 2001 From: embruna Date: Thu, 23 Jan 2025 13:03:21 -0500 Subject: [PATCH 09/34] draft updates to authors_address --- R/authors_address.R | 134 +- R/authors_address_update.R | 4757 +++++++++++++++++++++++------------- R/authors_georef.R | 7 +- 3 files changed, 3186 insertions(+), 1712 deletions(-) diff --git a/R/authors_address.R b/R/authors_address.R index 2b74932..f8f096b 100644 --- a/R/authors_address.R +++ b/R/authors_address.R @@ -10,25 +10,45 @@ authors_address <- function(addresses, ID){ message("\nSplitting addresses\n") + addresses<-tolower(addresses) # make all lower case list_address <- strsplit(addresses, ",") + + + # remove punctuation ---------------------------------------- + ## First remove periods and trim white space from countries. + ## helps avoids mistakes later on + + remove_period_from_last <- function(list_address) { + lapply(list_address, function(x) { + if (length(x) > 0) { + x[length(x)] <- gsub("\\.$", "", x[length(x)]) + x[length(x)] <- trimws(x[length(x)], which = "both") + } + return(x) + }) + } + + list_address <- remove_period_from_last(list_address) + + university_list <- vapply(list_address, function(x) x[1], character(1)) country_list <- vapply(list_address, function(x) { gsub("\\_", "", x[length(x)]) }, character(1)) country_list <- trimws(country_list, which = "both") - pc_list <- trimws(substr(country_list, 1, (vapply(regexpr("USA", + pc_list <- trimws(substr(country_list, 1, (vapply(regexpr("usa", country_list), function(x) x[1], numeric(1))) - 1), which = "right") state_list <- pc_list state_list[nchar(state_list) > 0] <- regmatches( state_list[nchar(state_list) > 0], - regexpr("[[:upper:]]{2}", state_list[nchar(state_list) > 0]) + regexpr("[[:lower:]]{2}", state_list[nchar(state_list) > 0]) ) pc_list[nchar(pc_list) > 2] <- regmatches(pc_list[nchar(pc_list) > 2], regexpr("[[:digit:]]{5}", pc_list[nchar(pc_list) > 2])) pc_list[nchar(pc_list) < 3] <- "" - country_list <- ifelse(grepl("USA", country_list), "USA", country_list) + country_list <- ifelse(grepl("usa", country_list), "usa", country_list) list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))]) @@ -78,20 +98,114 @@ authors_address <- function(addresses, ID){ a_df$postal_code[ grepl(us_reg, a_df$country) ] <- substr(a_df$country[grepl(us_reg, a_df$country)], 4, 8) - a_df$country[grepl(us_reg, a_df$country)] <- "USA" + a_df$country[grepl(us_reg, a_df$country)] <- "usa" + + + # Added by eb + + # USA ZIP CODES + + a_df$state<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$second_tier), + a_df$second_tier,a_df$state) + a_df$postal_code<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$second_tier), + a_df$second_tier,a_df$postal_code) + a_df$city<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$second_tier), + a_df$third_tier,a_df$city) + + a_df$state<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$state), + gsub("[[:digit:]]{5}","",a_df$state),a_df$state) + + a_df$postal_code<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$postal_code), + gsub("[[:alpha:]]{2}","",a_df$postal_code),a_df$postal_code) + + + # BRAZIL CODES + + a_df$postal_code<- ifelse(a_df$country=="brazil" & grepl("[a-z]{2}-[0-9]{4,8}", a_df$second_tier), + a_df$second_tier,a_df$postal_code) + a_df$city<- ifelse(a_df$country=="brazil" & grepl("[a-z]{2}-[0-9]{4,8}", a_df$second_tier), + a_df$second_tier,a_df$city) + + a_df$postal_code<- ifelse(a_df$country=="brazil" & grepl("[a-z]{2}-[0-9]{4,8}", a_df$third_tier), + a_df$third_tier,a_df$postal_code) + a_df$city<- ifelse(a_df$country=="brazil" & grepl("[a-z]{2}-[0-9]{4,8}", a_df$third_tier), + a_df$third_tier,a_df$city) + + - ########################## + a_df$postal_code<- ifelse(a_df$country=="brazil", + gsub(" .*","",a_df$postal_code),a_df$postal_code) + + a_df$city<- ifelse(a_df$country=="brazil", + gsub(".*br-[0-9]+ ", "", a_df$city),a_df$city) + + + a_df$state<- ifelse(a_df$country=="brazil" & is.na(a_df$city) & is.na(a_df$postal_code), + a_df$second_tier,a_df$state) + + a_df$city<- ifelse(a_df$country=="brazil" & is.na(a_df$city) & is.na(a_df$postal_code), + a_df$third_tier,a_df$city) + + a_df$state<- ifelse(a_df$country=="brazil" & nchar(a_df$second_tier)==2,a_df$second_tier,a_df$state) + + + a_df$postal_code<- ifelse(a_df$country=="brazil", + gsub("br-", "", a_df$postal_code),a_df$postal_code) + + + a_df$state<- ifelse(a_df$country=="brazil" & grepl("[a-z]{2} [0-9]{5}", a_df$state), + gsub("[[:digit:]]{5}","",a_df$state),a_df$state) + + a_df$postal_code<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$postal_code), + gsub("[[:alpha:]]{2}","",a_df$postal_code),a_df$postal_code) + + + + # Define words indicating this is actually a dept not state or postal code + # will use this list to delete the ones that don't apply + to_check <- c("dept","ctr","inst","ppg","andar","empresas", + "cena","educ","programa", "ciencias", "unidade", "lab ") + + a_df$city <- ifelse(a_df$country=="brazil" & grepl(paste(to_check, collapse = "|"), a_df$city), + a_df$second_tier, + a_df$city) + + + + a_df$state <- ifelse(a_df$country=="brazil" &grepl(paste(to_check, collapse = "|"), a_df$state), + a_df$second_tier,a_df$state) + + a_df$city<- ifelse(a_df$country=="brazil" & is.na(a_df$city) & is.na(a_df$postal_code), + a_df$second_tier,a_df$city) + + + a_df$city <- ifelse(grepl("museu nacl", a_df$city), + "rio de janeiro", a_df$city) + a_df$state <- ifelse(grepl("rio de janeiro", a_df$state, ignore.case = TRUE), + "rj", a_df$state) + a_df$state <- ifelse(grepl("sao paulo", a_df$state, ignore.case = TRUE), + "sp", a_df$state) + + + + results$brl_city[results$brl_city==results$brl_state]<-NA + + + # Clean up and adjust columns + results[] <- lapply(results, trimws) + + ########################## # We'll use regular expression to pull zipcodes # These formats differ by region int1 <- "[[:alpha:]]{2}[[:punct:]]{1}[[:digit:]]{1,8}" - int2 <- paste("[[:space:]][[:upper:]][[:digit:]][[:upper:]]", - "[[:space:]][[:digit:]][[:upper:]][[:digit:]]", sep="") + int2 <- paste("[[:space:]][[:upper:]][[:digit:]][[:lower:]]", + "[[:space:]][[:digit:]][[:lower:]][[:digit:]]", sep="") int3 <- "[[:alpha:]][[:punct:]][[:digit:]]{4,7}" - int4 <- "[:upper:]{1,2}[:alnum:]{1,3}[:space:][:digit:][:alnum:]{1,3}" + int4 <- "[:lower:]{1,2}[:alnum:]{1,3}[:space:][:digit:][:alnum:]{1,3}" int <- paste(int1, int2, int3, int4, sep = "|") - UK <- paste("[[:upper:]]{1,2}[[:digit:]]{1,2}[[:space:]]", - "{1}[[:digit:]]{1}[[:upper:]]{2}", sep="") + UK <- paste("[[:lower:]]{1,2}[[:digit:]]{1,2}[[:space:]]", + "{1}[[:digit:]]{1}[[:lower:]]{2}", sep="") Mexico <- "[[:space:]]{1}[[:digit:]]{5}" # technically US as well diff --git a/R/authors_address_update.R b/R/authors_address_update.R index 9fbd3d5..d22f8b1 100644 --- a/R/authors_address_update.R +++ b/R/authors_address_update.R @@ -18,20 +18,11 @@ authors_address <- function(addresses, ID){ addresses<-tolower(addresses) message("\nSplitting addresses\n") list_address <- strsplit(addresses, ",") - - -# trim ws ----------------------------------------------------------------- - - list_address <- lapply(list_address, trimws) - - - -# COUNTRIES --------------------------------------------------------------- - -# remove punctuation ---------------------------------------- + # remove punctuation ---------------------------------------- ## First remove periods and trim white space from countries. ## helps avoids mistakes later on + remove_period_from_last <- function(list_address) { lapply(list_address, function(x) { if (length(x) > 0) { @@ -41,11 +32,18 @@ authors_address <- function(addresses, ID){ return(x) }) } - + list_address <- remove_period_from_last(list_address) +# trim ws ----------------------------------------------------------------- + + list_address <- lapply(list_address, trimws) + -# correct names + +# correct countries ------------------------------------------------------- + + # correct names # Define the function correct_countries <- function(my_list, replacements) { # Loop through each element of the list @@ -79,41 +77,40 @@ authors_address <- function(addresses, ID){ "rep congo" = "congo", "north ireland" = "northern ireland", "syrian arab rep" = "syria" - ) - - + ) - list_address <- correct_countries(list_address, replacements) + list_address<- correct_countries(list_address, replacements) + # a_df$country <- correct_countries(a_df$country, replacements) -# Extract University ------------------------------------------------------ +# extract university ------------------------------------------------------ + university_list <- vapply(list_address, function(x) x[1], character(1)) - - -# Extract the Department -------------------------------------------------- - # If department is listed it is typically second - # this will be 2x checked later - ## EB: seond only if 4+ slots - dept_extract <- function(x) { - if (length(x) < 4) { - return(NA) - } else { - return(trimws(x[[2]])) +# extract department ------------------------------------------------------ + + # # If department is listed it is typically second + # # this will be 2x checked later + # ## EB: seond only if 4+ slots + dept_extract <- function(x) { + if (length(x) < 4) { + return(NA) + } else { + return(trimws(x[[2]])) + } } - } - # - dept_list <- unlist(lapply(list_address, dept_extract)) + # + dept_list <- unlist(lapply(list_address, dept_extract)) # dept_list <- vapply(list_address, function(x) x[2], character(1)) dept_list <- trimws(dept_list, which = "both") - -# Extract City ------------------------------------------------------------ - + + # Extract City ------------------------------------------------------------ + # If there is only one element, then it can't have both city and country' city_list <- vapply(list_address, function(x) { n <- length(x) @@ -130,27 +127,27 @@ authors_address <- function(addresses, ID){ # These countries' city is in multiple places # This puts ina placeholder, which will be replaced later in # the function that checks India and China - if(last_element %in% c("india", "china", - "brazil", "canada", "australia", - "scotland", "england", "wales", - "northern ireland")) - { - return("icb") # placeholder to replace with NA after function - } - - - if (grepl("usa",last_element)) { - return("icb") # placeholder to replace with NA after function - } + # if(last_element %in% c("india", "china", + # "brazil", "canada", "australia", + # "scotland", "england", "wales", + # "northern ireland")) + # { + # return("icb") # placeholder to replace with NA after function + # } + # + # + # if (grepl("usa",last_element)) { + # return("icb") # placeholder to replace with NA after function + # } # And of course a few other odd ones. This will check for # other countries with specific rules. - if ((last_element == "australia" && second_last != "liverpool") || - last_element %in% c("wales") || - (last_element == "mexico" && second_last != "iztapalapa") || - (last_element == "argentina" && second_last == "df")) { - return(third_last) - } + # if ((last_element == "australia" && second_last != "liverpool") || + # last_element %in% c("wales") || + # (last_element == "mexico" && second_last != "iztapalapa") || + # (last_element == "argentina" && second_last == "df")) { + # return(third_last) + # } # Default case return(second_last) @@ -162,472 +159,257 @@ authors_address <- function(addresses, ID){ city_list[city_list == "icb"] <- NA -# Extract Country --------------------------------------------------------- - - - country_list <- vapply(list_address, function(x) { - gsub("\\_", "", x[length(x)]) }, - character(1)) - -# Extracts zip & state from usa addresses --------------------------------- - - pc_list <- trimws(substr(country_list, 1, (vapply(regexpr("usa", - country_list), function(x) x[1], numeric(1))) - 1), which = "right") - state_list <- pc_list +# extract state ----------------------------------------------------------- - state_list[nchar(state_list) > 0] <- regmatches( - state_list[nchar(state_list) > 0], - regexpr("[[:lower:]]{2}", state_list[nchar(state_list) > 0]) - ) - state_list[state_list == ""] <- NA - - pc_list[nchar(pc_list) > 2] <- regmatches(pc_list[nchar(pc_list) > 2], - regexpr("[[:digit:]]{5}", pc_list[nchar(pc_list) > 2])) - pc_list[nchar(pc_list) < 3] <- "" - pc_list[pc_list == ""] <- NA + # Extract City ------------------------------------------------------------ - - -# USA -------------------------------------------------------------------- - - process_usa_address <- function(x) { - if (length(x) == 1) { - return(c(NA, NA)) # Not usa - } - if (grepl(" usa", x[[length(x)]])) { - if (length(x) == 4) { - return(c(trimws(x[[length(x) - 1]]), trimws(x[length(x)]))) - } - if (length(x) == 5) { - return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x)]]))) - } - if (length(x) == 3) { - return(c(trimws(x[[length(x) - 1]]), trimws(x[length(x)]))) - } else { - return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x)]]))) - } + # If there is only one element, then it can't have both city and country' + state_list <- vapply(list_address, function(x) { + n <- length(x) + if (n == 1) { + return("no state") # placeholder to replace with NA after function } - return(c(NA, NA)) # Not usa - } - - - # Apply the function across all addresses using `mapply` - results <- t(mapply(process_usa_address, list_address)) - colnames(results) <- c("usa_city", "usa_state") - - results<-as.data.frame(results) - results$pc<-NA - results$country<-NA - extract_usa_postcodes <- function(df, usa_state, pc,country) { - # 6 digits - pattern <- "[0-9]{5}" + # In some cases city is next-to-last element, in others next-to-next-to-last + last_element <- x[[n]] + second_last <- if (n > 1) x[[n - 1]] else NA + third_last <- if (n > 2) x[[n - 2]] else NA - # Loop through each row of the dataframe - for(i in 1:nrow(df)) { - # Find all matches of the pattern in the source column - matches <- gregexpr(pattern, df[i, usa_state]) - # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) - # Extract the matches - extracted_codes <- regmatches(df[i, usa_state], matches) - # If there's at least one match and the target column is NA, copy the first match to the target column - if(length(extracted_codes[[1]]) > 0 && is.na(df[i, pc])) { - df[i, pc] <- extracted_codes[[1]][1] - df[i, country] <- "usa" - # df[i, city_col] <- df[i, source_col] - - } - } - return(df) - } - - - results <- extract_usa_postcodes(results, "usa_state", "pc","country") - - - # any without numbers gets NA'd - results$pc[!grepl("\\d", results$pc)] <- NA - - # keep portions with numbers / remove city names - results$usa_city<-sub("[0-9]{5}","",results$usa_city) - results$usa_state<-sub("[0-9]{5}","",results$usa_state) - results$usa_state<-sub("usa","",results$usa_state) - results$usa_state<-trimws(results$usa_state, which = "both") - - results$country<-ifelse(is.na(results$usa_city)==FALSE,"usa",results$country) - - + # Check for India, China, & brazil, canada, australia, and UK. + # These countries' city is in multiple places + # This puts ina placeholder, which will be replaced later in + # the function that checks India and China + # if(last_element %in% c("india", "china", + # "brazil", "canada", "australia", + # "scotland", "england", "wales", + # "northern ireland")) + # { + # return("icb") # placeholder to replace with NA after function + # } + # + # + # if (grepl("usa",last_element)) { + # return("icb") # placeholder to replace with NA after function + # } + + # And of course a few other odd ones. This will check for + # other countries with specific rules. + # if ((last_element == "australia" && second_last != "liverpool") || + # last_element %in% c("wales") || + # (last_element == "mexico" && second_last != "iztapalapa") || + # (last_element == "argentina" && second_last == "df")) { + # return(third_last) + # } + + # Default case + return(third_last) + }, character(1)) + # Cleanup + state_list <- trimws(state_list, which = "both") + state_list[state_list == "no city"] <- NA + state_list[state_list == "icb"] <- NA - # Update `city_list` if necessary - city_list <- ifelse(is.na(city_list), results$usa_city, city_list) - # - # # Update `state_list` if necessary - state_list<-ifelse(is.na(state_list),results$usa_state, state_list) - # - # # Update `pc_list` if necessary - pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) - # Update `country_list` if necessary - country_list<-ifelse((grepl("usa",country_list)),"usa", country_list) - # remove any with "state_abbrev zip code" but no USA - country_list <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", country_list, ignore.case = TRUE), "usa", country_list) + city_list2 <- trimws(state_list, which = "both") + # Extract Country --------------------------------------------------------- + country_list <- vapply(list_address, function(x) { + gsub("\\_", "", x[length(x)]) }, + character(1)) - us_state_abbreviations_lower <- c("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", - "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", - "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", - "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", - "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy") - country_list<-ifelse(country_list%in%us_state_abbreviations_lower,"usa", country_list) - rm(results) -# AUSTRALIA --------------------------------------------------------------- - - ## Australia postal codes also separated - # sometimes in the next-to-last, sometimes in the next-to-next-to-last (with state) - - # First need to "fix" the city - # Function to check for three characters or numbers in the city-list and replace with NA - process_aus_address <- function(x) { - if (length(x) == 1) { - return(c(NA, NA)) # Not Australia - } - - if (x[[length(x)]] == "australia") { - if (length(x) == 3) { - return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) - } else { - return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) - } - } - - return(c(NA, NA)) # Not Australia - } - - # Apply the function across all addresses using `mapply` - results <- t(mapply(process_aus_address, list_address)) - colnames(results) <- c("aus_city", "aus_state") - - # Clean up results - results <- as.data.frame(results, stringsAsFactors = FALSE) - results$aus_city[results$aus_city == "not australia"] <- NA - results$aus_state[results$aus_state == "not australia"] <- NA - - # take the PC+state and assign to PC - results$aus_pc<-results$aus_state - # remove all digits from state - results$aus_state <- trimws(gsub("\\d", "", results$aus_state)) - # remove all letters from pc - results$aus_pc <- trimws(gsub("[a-z]", "", results$aus_pc)) - results$aus_pc[results$aus_pc == ""] <- NA - - # if na in PC, assign the city (some of which have PC) - results$aus_pc <- ifelse(is.na(results$aus_pc), results$aus_city, results$aus_pc) - # remove all metters from pc, leaving any new pc - results$aus_pc <- trimws(gsub("[a-z]", "", results$aus_pc)) - results$aus_pc[results$aus_pc == ""] <- NA - # now remove any PC from city - results$aus_city <- trimws(gsub("\\d", "", results$aus_city)) #alternative - - # Update `city_list` if necessary - city_list <- ifelse(is.na(city_list), results$aus_city, city_list) - - # Update `state_list` if necessary - state_list<-ifelse(is.na(state_list),results$aus_state, state_list) - - # Update `pc_list` if necessary - pc_list<-ifelse(is.na(pc_list),results$aus_pc, pc_list) +# pc list ----------------------------------------------------------------- - rm(results) - +# pc often with city + pc_list<-city_list -# CANADA ------------------------------------------------------------------ +# bind into df ------------------------------------------------------------ - ## Canada postal codes also separated - # sometimes in the next-to-last, sometimes in the next-to-next-to-last (with state) - - process_can_address <- function(x) { - if (length(x) == 1) { - return(c(NA, NA)) # Not Canada - } - - if (x[[length(x)]] == "canada") { - if (length(x) == 3) { - return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) - } else { - return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) - } - } - - return(c(NA, NA)) # Not canada - } - - # Apply the function across all addresses using `mapply` - results <- t(mapply(process_can_address, list_address)) - colnames(results) <- c("can_city", "can_state") - # Clean up results - results <- as.data.frame(results, stringsAsFactors = FALSE) - results$can_city[results$can_city == "not canada"] <- NA - results$can_state[results$can_state == "not canada"] <- NA - - # take the PC+state and assign to PC - results$can_pc<-results$can_state - - # any without numbers gets NA'd - results$can_pc[!grepl("\\d", results$can_pc)] <- NA - # removes state and removes ltr at start of PC - results$can_pc<-sub("[a-z]\\s+[a-z]", "", results$can_pc) #+[a-z] - - - # if na in PC, assign the city (some of which have PC) - results$can_pc <- ifelse(is.na(results$can_pc), results$can_city, results$can_pc) - results$can_pc[!grepl("\\d", results$can_pc)] <- NA - - # keep portions with numbers / remove city names - # .*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$: This regex pattern matches any - # characters (.*) followed by a word boundary (\\b) and exactly three word - # characters (\\w{3}), capturing this as the first group ((\\w{3})). It - # then matches any characters again (.*) followed by another word boundary - # and exactly three word characters, capturing this as the second - # group ((\\w{3})), and ensures this is at the end of the string ($). - # sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", string): Replaces the - # entire string with the two captured groups separated by a space. - results$can_pc<-sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", results$can_pc) - - # results$can_pc<-gsub("\\b[^\\d\\s]+\\b", "", results$can_pc) #+[a-z] removes 2nd ltr + a_df <- data.frame( + adID = ID, + university = university_list, + country = country_list, + state = state_list, + postal_code = pc_list, + city = city_list, + city2 = city_list2, + department = dept_list, + address = addresses, + stringsAsFactors = FALSE + ) - results$can_pc[results$can_pc == ""] <- NA - # now remove any PC from city - results$can_city<-sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", results$can_city) # all after first space - - # fix state - results$can_state <- trimws(gsub("\\d", "", results$can_state)) - results$can_state <-trimws(sub(" .*", "", results$can_state)) # all after first space - results$can_state <- gsub("british", "bc", results$can_state) - results$can_state <- gsub("nova", "ns", results$can_state) - # fix city - results$can_city <- trimws(gsub("\\d", "", results$can_city)) - # Update `city_list` if necessary - city_list <- ifelse(is.na(city_list), results$can_city, city_list) - # Update `state_list` if necessary - state_list<-ifelse(is.na(state_list),results$can_state, state_list) - # Update `pc_list` if necessary - pc_list<-ifelse(is.na(pc_list),results$can_pc, pc_list) + # any PC without numbers gets NA'd + a_df$postal_code[!grepl("\\d", a_df$postal_code)] <- NA - rm(results) - -# INDIA ------------------------------------------------------------------ - - # India states are almost always listed but New Delhi is complicated, - # as are any with only three entries - # Function to process addresses for both city and state - process_india_address <- function(x) { - if (length(x) == 1) { - return(c(NA, NA)) # Not India - } - - if (x[[length(x)]] == "india") { - if (length(x) == 3) { - return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) - } else { - return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) - } - } - - if (length(x) != 3 && grepl("delhi", x[[length(x) - 1]])) { - return(c(trimws(x[[length(x) - 1]]), NA)) - } - - return(c(NA, NA)) # Not India - } + # copy over PC and state + a_df$state<- ifelse(grepl("usa",a_df$country) & nchar(a_df$state)>2, + NA, + a_df$state) - # Apply the function across all addresses using `mapply` - results <- t(mapply(process_india_address, list_address)) - colnames(results) <- c("india_city", "india_state") - # Clean up results - results <- as.data.frame(results, stringsAsFactors = FALSE) - results$india_city[results$india_city == "not india"] <- NA - results$india_state[results$india_state == "not india"] <- NA + a_df$postal_code<- ifelse(grepl("[a-z]{2} [0-9]{5} usa", a_df$country), + a_df$country,a_df$postal_code) - # Remove numeric parts from state names and trim whitespace - results$india_state <- trimws(gsub("\\s([0-9]+)", "", results$india_state)) + a_df$state <- ifelse(grepl("[a-z]{2} [0-9]{5} usa", a_df$country), + a_df$country,a_df$state) - # Update `city_list` if necessary - city_list <- ifelse(is.na(city_list), results$india_city, city_list) - #### + a_df$state<- ifelse(grepl("[a-z]{2} [0-9]{5}", a_df$city), + a_df$city,a_df$state) - state_list<-ifelse(is.na(state_list),results$india_state, state_list) - rm(results) + a_df$state<- ifelse(grepl("[a-z]{2} usa", a_df$country), + a_df$country,a_df$state) - -# BRAZIL ------------------------------------------------------------------ - - - # Function to process addresses for both city and state - process_brazil_address <- function(x) { - if (length(x) == 1) { - return(c(NA, NA)) # Not brl - } - if (x[[length(x)]] == "brazil") { - if (length(x) == 3) { - return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) - } else { - return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) - } - } - - - return(c(NA, NA)) # Not brl - } + # remove the numbers and letters as appropriate - - # Apply the function across all addresses - results <- as.data.frame(t(mapply(process_brazil_address, list_address)), - stringsAsFactors = FALSE) - colnames(results) <- c("brl_city", "brl_state") + a_df$country<- ifelse(grepl(" usa", a_df$country), + "usa",a_df$country) + a_df$state<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$state), + gsub("[[:digit:]]{5}","",a_df$state),a_df$state) + a_df$state<- ifelse(a_df$country=="usa" & grepl(" usa", a_df$state), + gsub(" usa","",a_df$state),a_df$state) + + a_df$postal_code<- ifelse(a_df$country=="usa", + gsub("[[:alpha:]]{2} ","", + a_df$postal_code),a_df$postal_code) - # Define words indicating this is actually a dept not state or postal code - # will use this list to delete the ones that don't apply - to_check <- c("dept","ctr","inst","ppg","andar","empresas", - "programa", "ciencias", "unidade", "lab ") + a_df$postal_code<- ifelse(a_df$country=="usa", + gsub(" usa","", + a_df$postal_code),a_df$postal_code) - results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), - results$brl_state, - results$brl_city) - results$brl_state <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_state), - results$brl_city, - results$brl_state) + a_df$city<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$city), + a_df$city2,a_df$city) - # Define the function - extract_brl_postcodes <- function(df, source_col, target_col,brl_new_city) { - # 6 digits - - pattern <- "br-[0-9]{5,8}" - - # Loop through each row of the dataframe - for(i in 1:nrow(df)) { - # Find all matches of the pattern in the source column - matches <- gregexpr(pattern, df[i, source_col]) - # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) - # Extract the matches - extracted_codes <- regmatches(df[i, source_col], matches) - # If there's at least one match and the target column is NA, copy the first match to the target column - if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { - df[i, target_col] <- extracted_codes[[1]][1] - df[i, brl_new_city] <- df[i, source_col] - # df[i, city_col] <- df[i, source_col] - - } - } - return(df) - } + pattern <- "[a-z]{2} [0-9]{5}" - results$brl_pc<-NA - results$brl_new_city<-NA - # df, source_col, target_col,city_col - results <- extract_brl_postcodes(results, "brl_city","brl_pc", "brl_new_city") - results <- extract_brl_postcodes(results, "brl_state","brl_pc", "brl_new_city") + a_df$postal_code<- ifelse(grepl(pattern, a_df$country), + a_df$country,a_df$postal_code) + a_df$state<- ifelse(grepl(pattern, a_df$country), + a_df$country,a_df$state) + a_df$country<- ifelse(grepl(pattern, a_df$country), + "usa",a_df$country) + a_df$postal_code<- ifelse(a_df$country=="usa" & grepl(pattern, a_df$postal_code), + gsub("[a-z]","",a_df$postal_code),a_df$postal_code) + a_df$state<- ifelse(a_df$country=="usa" & grepl(pattern, a_df$state), + gsub("[0-9]","",a_df$postal_code),a_df$state) + + #TODO: correct this to catch any that didn;t get caught - results$brl_new_city <- ifelse(is.na(results$brl_new_city), - results$brl_state, - results$brl_new_city) + # + # us_state_abbreviations_lower <- c("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", + # "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", + # "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", + # "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", + # "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy") + # + # country_list<-ifelse(country_list%in%us_state_abbreviations_lower,"usa", country_list) - results$brl_state <- ifelse(results$brl_new_city==results$brl_state, - results$brl_city, - results$brl_state) +# brazil clean-up --------------------------------------------------------- + + a_df$state<- ifelse(a_df$country=="brazil" & nchar(a_df$city)==2, + a_df$city,a_df$state) + a_df$city<- ifelse(a_df$country=="brazil" & nchar(a_df$city)==2, + a_df$city2,a_df$city) + a_df$city2<- ifelse(a_df$country=="brazil" & a_df$city==a_df$city2, + NA,a_df$city2) + a_df$postal_code<- ifelse(a_df$country=="brazil" & is.na(a_df$postal_code), + a_df$city,a_df$postal_code) + a_df$state<- ifelse(a_df$country=="brazil" & nchar(a_df$state)>2, + NA,a_df$state) + a_df$postal_code<- ifelse(a_df$country=="brazil", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code) + a_df$postal_code<- ifelse(a_df$country=="brazil", + gsub("[-]", "", a_df$postal_code), + a_df$postal_code) - results$brl_new_city <- ifelse(is.na(results$brl_new_city), - results$brl_city, - results$brl_new_city) + a_df$city<- ifelse(a_df$country=="brazil", + gsub("br-", "", a_df$city), + a_df$city) + a_df$city<- ifelse(a_df$country=="brazil", + gsub("[0-9]", "", a_df$city), + a_df$city) + a_df$state<- ifelse(a_df$country=="brazil" & grepl("br-", a_df$city2), + a_df$city,a_df$state) + a_df$postal_code<- ifelse(a_df$country=="brazil" & grepl("br-", a_df$city2), + a_df$city2,a_df$postal_code) + a_df$city<- ifelse(a_df$country=="brazil" & grepl("br-", a_df$city2), + a_df$city2,a_df$city) - results$brl_city<-gsub("br-","",results$brl_city) - results$brl_city<-sub("[0-9]{2,8}","",results$brl_city) - results$brl_city<-sub("-","",results$brl_city) + # repeat the clean of city + a_df$city<- ifelse(a_df$country=="brazil", + gsub("br-", "", a_df$city), + a_df$city) + a_df$city<- ifelse(a_df$country=="brazil", + gsub("[0-9]", "", a_df$city), + a_df$city) - results$brl_new_city<-gsub("br-","",results$brl_new_city) - results$brl_new_city<-sub("[0-9]{2,8}","",results$brl_new_city) - results$brl_new_city<-sub("-","",results$brl_new_city) + a_df$postal_code<- ifelse(a_df$country=="brazil", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code) + a_df$postal_code<- ifelse(a_df$country=="brazil", + gsub("[-]", "", a_df$postal_code), + a_df$postal_code) + a_df[] <- lapply(a_df, trimws) - results$brl_pc<-gsub("br-","",results$brl_pc) - results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) - results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) - results$brl_state<-gsub("br-","",results$brl_state) - results$brl_state<-sub("[0-9]{2,8} ","",results$brl_state) - results$brl_state<-sub(" [0-9]{2,8} ","",results$brl_state) + # # Define words indicating this is actually a dept not state or postal code + # # will use this list to delete the ones that don't apply + # to_check <- c("dept","ctr","inst","ppg","andar","empresas", + # "programa", "ciencias", "unidade", "lab ") # + # a_df$city <- ifelse(a_df$country=="brazil" & grepl(paste(to_check, collapse = "|"), a_df$city), + # a_df$state, + # results$brl_city) # # # - # - # results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) - # results$brl_new_city<-gsub("gavea rio de janeiro","rio de janeiro",results$brl_new_city) - # results$brl_new_city<-gsub("gavea","rio de janeiro",results$brl_new_city) - # # - # results$brl_city<-gsub("-","",results$brl_city) - # - # results$brl_state<-gsub("br-","",results$brl_state) - # results$brl_state<-gsub("-","",results$brl_state) - - - - + # results$brl_state <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_state), + # results$brl_city, + # results$brl_state) - # any without numbers gets NA'd - results$brl_pc[!grepl("\\d", results$brl_pc)] <- NA - - # keep portions with numbers / remove city names - - results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) - results$brl_city<-sub(" [0-9]{2,8}","",results$brl_city) # - # # Specific replacements - # results$brl_city <- ifelse(grepl("rio de janeiro", results$brl_city), + # results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), # "rio de janeiro", results$brl_city) - - results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), - "rio de janeiro", results$brl_city) - results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), - "rj", results$brl_state) - results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), - "sp", results$brl_state) - + # results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), + # "rj", results$brl_state) + # results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), + # "sp", results$brl_state) + # - results$brl_city[results$brl_city==results$brl_state]<-NA + # results$brl_city[results$brl_city==results$brl_state]<-NA # Clean up and adjust columns - results[] <- lapply(results, trimws) + # results[] <- lapply(results, trimws) # Define city-to-state mapping @@ -639,191 +421,254 @@ authors_address <- function(addresses, ID){ # Match cities and states for (i in 1:nrow(city_state_mapping)) { - results$brl_city <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), - city_state_mapping$city[i], results$brl_city) - results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), - city_state_mapping$state[i], results$brl_state) + a_df$city <- ifelse(a_df$country=="brazil" & grepl(city_state_mapping$city[i], a_df$state, ignore.case = TRUE), + city_state_mapping$city[i], a_df$city) + a_df$state <- ifelse(a_df$country=="brazil" & grepl(city_state_mapping$city[i], a_df$state, ignore.case = TRUE), + city_state_mapping$state[i], a_df$state) } - # Match cities and states for (i in 1:nrow(city_state_mapping)) { - - results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), - city_state_mapping$state[i], results$brl_state) + a_df$state <- ifelse(a_df$country=="brazil" & grepl(city_state_mapping$city[i], a_df$city, ignore.case = TRUE), + city_state_mapping$state[i], a_df$state) } - results$brl_state <- ifelse(results$brl_new_city==results$brl_state, - results$brl_city, - results$brl_state) + # brazil_delete<- c("barretos canc hosp","univ fed sao paulo", + # "escola filosofia letras & ciencias humanas", + # "hosp sirio libanes","perola byington hosp", + # "sp","univ fed sao paulo","nacl","pesquisa", "museu","dept", + # "lab","zoologia", "inst", "programa","ppg", "ppg") - results$brl_city <- trimws(results$brl_city, which = "both") - results$brl_state <- trimws(results$brl_state, which = "both") - # Define words indicating this is actually a dept not state or postal code - # will use this list to delete the ones that don't apply - to_check <- c("dept","ctr","inst","ppg", - "programa", "ciencias", "unidade", "lab ") + # city_clean$city_list <- ifelse(city_clean$country_list=="brazil", + # gsub(paste(brazil_delete, collapse = "|"), NA, city_clean$city_list), + # city_clean$city_list) - results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), - results$brl_state, - results$brl_city) + # AUSTRALIA --------------------------------------------------------------- + a_df$state<- ifelse(a_df$country=="australia", + a_df$city,a_df$state) + a_df$postal_code<- ifelse(a_df$country=="australia", + a_df$city,a_df$postal_code) + a_df$city<- ifelse(a_df$country=="australia", + a_df$city2,a_df$city) + a_df$city2<- ifelse(a_df$country=="australia", + NA,a_df$city2) - # Final trimming - results[] <- lapply(results, trimws) - results$brl_city <- ifelse(results$brl_new_city==results$brl_city, - NA, - results$brl_city) - # Update `city_list` if necessary - city_list <- ifelse(is.na(city_list), results$brl_city, city_list) - # Update `state_list` if necessary - state_list<-ifelse(is.na(state_list),results$brl_state, state_list) - # Update `pc_list` if necessary - pc_list<-ifelse(is.na(pc_list),results$brl_pc, pc_list) - rm(results,city_state_mapping) + a_df$postal_code<- ifelse(a_df$country=="australia", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code) + a_df$state<- ifelse(a_df$country=="australia", + gsub("[0-9]", "", a_df$state), + a_df$state) + a_df[] <- lapply(a_df, trimws) + +# canada ------------------------------------------------------------------ + a_df$state<- ifelse(a_df$country=="canada" & nchar(a_df$city)==2, + a_df$city,a_df$state) + a_df$city<- ifelse(a_df$country=="canada" & nchar(a_df$city)==2, + NA,a_df$city) + a_df$postal_code<- ifelse(a_df$country=="canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2), + a_df$city2,a_df$postal_code) + a_df$postal_code<- ifelse(a_df$country=="canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2), + a_df$city2,a_df$postal_code) + a_df$state<- ifelse(a_df$country=="canada" & a_df$city2==a_df$state, + NA,a_df$state) + a_df$city<- ifelse(a_df$country=="canada",a_df$city2,a_df$city) + a_df$city2<- ifelse(a_df$country=="canada",NA,a_df$city2) + a_df$city<- ifelse(a_df$country=="canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city), + gsub("\\b(\\w{3})\\b \\b(\\w{3})\\b","", a_df$city), + a_df$city) + a_df$state<- ifelse(a_df$country=="canada" & is.na(a_df$state), + a_df$postal_code, + a_df$state) + a_df$state<- ifelse(a_df$country=="canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$state), + gsub("\\b(\\w{3})\\b \\b(\\w{3})\\b","", a_df$state), + a_df$state) + a_df$postal_code <- ifelse(a_df$country=="canada" & grepl("\\b(\\w{2,20})\\b \\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$postal_code), + gsub("\\b(\\w{1,2}|\\w{4,})\\b", "", a_df$postal_code), + a_df$postal_code) + a_df[] <- lapply(a_df, trimws) + #TODO: a few postal codes still have letters from city + a_df$postal_code <- ifelse(a_df$country=="canada", gsub(" ","",a_df$postal_code),a_df$postal_code) + +# UK ---------------------------------------------------------------------- + - # Handle postal codes (BR-[0-9]) - results$brazil_pc <- ifelse(grepl("br-[0-9]", results$brazil_state), results$brazil_state, NA) - results$brazil_pc <- ifelse(grepl("br-[0-9]", results$brazil_city) & is.na(results$brazil_pc), - results$brazil_city, results$brazil_pc) + uk<- c("scotland", "england", "wales","northern ireland") + pattern <- "[a-z0-9]{2,4} [a-z0-9]{3,4}" + # + # a_df$postal_code <- ifelse(a_df$country %in% uk & + # grepl(pattern, a_df$city2),a_df$city2, + # a_df$postal_code) - # Remove BR codes from city and state - results$brazil_city <- gsub("br-[0-9]+", "", results$brazil_city) - results$brazil_state <- gsub("br-[0-9]+", "", results$brazil_state) + a_df$postal_code<- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city2), + a_df$city2,a_df$postal_code) + a_df$postal_code<- ifelse(a_df$country %in% uk & grepl(pattern, a_df$state), + a_df$state,a_df$postal_code) + a_df$postal_code<- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city), + a_df$city,a_df$postal_code) - # Define city-to-state mapping - city_state_mapping <- data.frame( - city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "sao paulo"), - state = c("sp", "sp", "sp", "sp", "rj", "rj", "sp"), - stringsAsFactors = FALSE - ) + a_df$postal_code <- ifelse(a_df$country %in% uk, + ifelse(!grepl("\\d", a_df$postal_code), NA, a_df$postal_code), + a_df$postal_code) - # Match cities and states - for (i in 1:nrow(city_state_mapping)) { - results$brazil_city <- ifelse(grepl(city_state_mapping$city[i], results$brazil_state, ignore.case = TRUE), - city_state_mapping$city[i], results$brazil_city) - results$brazil_state <- ifelse(grepl(city_state_mapping$city[i], results$brazil_state, ignore.case = TRUE), - city_state_mapping$state[i], results$brazil_state) + a_df$city<- ifelse(a_df$country %in% uk & a_df$city==a_df$postal_code, + NA,a_df$city) + + a_df$state<- ifelse(a_df$country %in% uk & a_df$state==a_df$postal_code, + NA,a_df$state) + + + a_df$state<- ifelse(a_df$country=="england",a_df$city,a_df$state) + a_df$city<- ifelse(a_df$country=="england",NA,a_df$city) + a_df$city<- ifelse(a_df$country=="england",a_df$postal_code,a_df$city) + a_df$city<- ifelse(a_df$country=="england", + gsub("\\b\\w*\\d\\w*\\b", "", a_df$city), + a_df$city) + + #TODO: england still needs work + + a_df$state<- ifelse(a_df$country=="scotland"|a_df$country=="northern ireland"|a_df$country=="wales" ,NA,a_df$state) + a_df$state<- ifelse(a_df$country=="scotland"|a_df$country=="northern ireland"|a_df$country=="wales" & is.na(a_df$state),a_df$city,a_df$state) + a_df$city<- ifelse(a_df$country=="scotland"|a_df$country=="northern ireland"|a_df$country=="wales",a_df$postal_code,a_df$city) + a_df$city<- ifelse(a_df$country=="scotland"|a_df$country=="northern ireland"|a_df$country=="wales" & is.na(a_df$city),a_df$city2,a_df$city) + a_df$city<- ifelse(a_df$country=="scotland"|a_df$country=="northern ireland"|a_df$country=="wales", + gsub("\\b\\w*\\d\\w*\\b", "", a_df$city), + a_df$city) + + +# postal codes clean uk --------------------------------------------------- + + + # Define the function + keep_numerical_parts <- function(df, control_col, country, target_col) { + # Apply the function to each row using sapply or a loop + df[[target_col]] <- sapply(1:nrow(df), function(i) { + if (df[[control_col]][i] == country) { + # Use gregexpr to find all parts of the string that include a numeral + matches <- gregexpr("\\b\\S*\\d\\S*\\b", df[[target_col]][i]) + # Extract the matched parts + result <- regmatches(df[[target_col]][i], matches) + # Combine the matched parts into a single string + result <- unlist(result) + result <- paste(result, collapse = " ") + result <- gsub(" ", "",result) + return(result) + } else { + return(df[[target_col]][i]) + } + }) + + return(df) } - # Specific replacements - results$brazil_city <- ifelse(grepl("museu nacl", results$brazil_city), - "rio de janeiro", results$brazil_city) - results$brazil_state <- ifelse(grepl("rio de janeiro", results$brazil_state, ignore.case = TRUE), - "rj", results$brazil_state) - results$brazil_state <- ifelse(grepl("sao paulo", results$brazil_state, ignore.case = TRUE), - "sp", results$brazil_state) - - # cleanup - results$brazil_city[results$brazil_city==results$brazil_state]<-NA - results$brazil_city <- trimws(results$brazil_city, which = "both") - results$brazil_state <- trimws(results$brazil_state, which = "both") - # Define words indicating this is actually a dept not state or postal code - # will use this list to delete the ones that don't apply - to_check <- c("dept","ctr","inst","ppg", - "programa", "ciencias", "unidade", "lab ") + + a_df <- keep_numerical_parts(a_df, "country","scotland", "postal_code") + a_df <- keep_numerical_parts(a_df, "country","england", "postal_code") + a_df <- keep_numerical_parts(a_df, "country","northern ireland", "postal_code") + a_df <- keep_numerical_parts(a_df, "country","wales", "postal_code") + + + - results$brazil_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brazil_city), - results$brazil_state, - results$brazil_city) - # Clean postal codes - results$brazil_pc <- gsub("[A-Za-z-]", "", results$brazil_pc) + +# india ------------------------------------------------------------------ + - # Final trimming - results[] <- lapply(results, trimws) + a_df$postal_code<- ifelse(a_df$country=="india" & grepl("[0-9]{5,10}", a_df$city2), + a_df$city2,a_df$postal_code) + a_df$postal_code<- ifelse(a_df$country=="india" & grepl("[0-9]{5,10}", a_df$city), + a_df$city,a_df$postal_code) - # Update `city_list` if necessary - city_list <- ifelse(is.na(city_list), results$brazil_city, city_list) - # Update `state_list` if necessary - state_list<-ifelse(is.na(state_list),results$brazil_state, state_list) - # Update `pc_list` if necessary - pc_list<-ifelse(is.na(pc_list),results$brazil_pc, pc_list) + a_df$city2<- ifelse(a_df$country=="india" & a_df$state==a_df$city2, + a_df$state,a_df$city2) + a_df$state<- ifelse(a_df$country=="india",NA,a_df$state) + a_df$state<- ifelse(a_df$country=="india" & is.na(a_df$postal_code), + a_df$city, a_df$state) + a_df$city<- ifelse(a_df$country=="india" & a_df$state==a_df$city, + NA, a_df$city) + a_df$city<- ifelse(a_df$country=="india" & grepl("[0-9]{4,10}", a_df$postal_code), + a_df$postal_code,a_df$city) + a_df$city<- ifelse(a_df$country=="india" & is.na(a_df$city), + a_df$city2, a_df$city) - rm(results,city_state_mapping) + a_df$postal_code<- ifelse(a_df$country=="india", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code) + a_df$city<- ifelse(a_df$country=="india", + gsub("[0-9]", "", a_df$city), + a_df$city) + + a_df$city <- ifelse(a_df$country=="india" & (grepl("delhi", a_df$city)|grepl("delhi", a_df$state)), + "new delhi", a_df$city) -# CHINA ------------------------------------------------------------------- - chn_extract <- function(x) { - if (length(x) == 1) { - return(c(NA, NA)) - } else if (x[[length(x)]] == "china") { - return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 2]]))) - } else { - return(c(NA, NA)) - } - } +# china ------------------------------------------------------------------- + - chn_pc <- data.frame(do.call(rbind, lapply(list_address, chn_extract))) - names(chn_pc) <- c("chn_city", "chn_state") - # Define the function - extract_china_postcodes <- function(df, source_col, target_col,chn_new_city) { - # 6 digits - pattern <- "[0-9]{6}" - - # Loop through each row of the dataframe - for(i in 1:nrow(df)) { - # Find all matches of the pattern in the source column - matches <- gregexpr(pattern, df[i, source_col]) - # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) - # Extract the matches - extracted_codes <- regmatches(df[i, source_col], matches) - # If there's at least one match and the target column is NA, copy the first match to the target column - if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { - df[i, target_col] <- extracted_codes[[1]][1] - df[i, chn_new_city] <- df[i, source_col] - # df[i, city_col] <- df[i, source_col] - - } - } - return(df) - } + a_df$postal_code<- ifelse(a_df$country=="china" & grepl("[0-9]{5,10}", a_df$city2), + a_df$city2,a_df$postal_code) + a_df$postal_code<- ifelse(a_df$country=="china" & grepl("[0-9]{5,10}", a_df$city), + a_df$city,a_df$postal_code) + a_df$postal_code<- ifelse(a_df$country=="china" & grepl("[0-9]{5,10}", a_df$state), + a_df$state,a_df$postal_code) - chn_pc$chn_pc<-NA - chn_pc$chn_new_city<-NA - # df, source_col, target_col,city_col - chn_pc <- extract_china_postcodes(chn_pc, "chn_city","chn_pc", "chn_new_city") - chn_pc <- extract_china_postcodes(chn_pc, "chn_state","chn_pc", "chn_new_city") + a_df$city2<- ifelse(a_df$country=="china" & a_df$state==a_df$city2, + a_df$state,a_df$city2) + a_df$state<- ifelse(a_df$country=="china",NA,a_df$state) + a_df$state<- ifelse(a_df$country=="china" & is.na(a_df$postal_code), + a_df$city, a_df$state) + a_df$city<- ifelse(a_df$country=="china" & a_df$state==a_df$city, + NA, a_df$city) + a_df$city<- ifelse(a_df$country=="china" & grepl("[0-9]{4,10}", a_df$postal_code), + a_df$postal_code,a_df$city) + a_df$city<- ifelse(a_df$country=="china" & is.na(a_df$city), + a_df$city2, a_df$city) - # any without numbers gets NA'd - chn_pc$chn_pc[!grepl("\\d", chn_pc$chn_pc)] <- NA + a_df$postal_code<- ifelse(a_df$country=="china", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code) + a_df$city<- ifelse(a_df$country=="china", + gsub("[0-9]", "", a_df$city), + a_df$city) - # keep portions with numbers / remove city names - chn_pc$chn_new_city<-sub("[0-9]{6}","",chn_pc$chn_new_city) - chn_pc$chn_city<-sub("[0-9]{6}","",chn_pc$chn_city) - chn_pc$chn_state<-sub("[0-9]{6}","",chn_pc$chn_state) + + + a_df$city<- ifelse(a_df$country=="china" & grepl("beijing", a_df$state), + "beijing",a_df$city) # Define words indicating this is actually a dept not state or postal code @@ -834,1241 +679,2755 @@ authors_address <- function(addresses, ID){ "assoc","forest") + pattern <- paste(to_delete, collapse = "|") + # Apply the ifelse function to update + # a_df$city <- ifelse(a_df$country == "china" & + # grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), + # NA, a_df$city) + # + # + # a_df$city<- ifelse(a_df$country=="china" & is.na(a_df$city), + # a_df$state, a_df$city) + # - chn_pc[, c("chn_city","chn_state", "chn_pc", "chn_new_city")]<-lapply(chn_pc[, c("chn_city","chn_state", "chn_pc", "chn_new_city")], trimws) + a_df[] <- lapply(a_df, trimws) + # Clean chn_pc1 and chn_pc2 - clean_column <- function(column, delete_terms) { - column <- gsub(paste(delete_terms, collapse = "|"), NA, column) - column <- gsub("[0-9]", "", column) # Remove digits - trimws(column) # Remove leading/trailing whitespace - } + + # TODO: check this, allows verifying iof what is in state is actually the city - # Clean chn_pc1 and chn_pc2 - chn_pc$chn_city <- clean_column(chn_pc$chn_city, to_delete) - chn_pc$chn_new_city <- clean_column(chn_pc$chn_new_city, to_delete) - chn_pc$chn_state <- clean_column(chn_pc$chn_state, to_delete) + # chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", + # "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", + # "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", + # "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", + # "gansu", "inner mongolia", "jilin", "hainan", "ningxia", + # "qinghai", "tibet", "macao") + # pattern <- paste(to_delete, collapse = "|") + # a_df$state<- ifelse(a_df$country=="china" & + # grepl(pattern, a_df$state, ignore.case = TRUE, perl = TRUE), + # NA, a_df$state) + # TODO: fix. not necessary but useful. + # All the cities in the addresses, add as needed. + # chn_cities <- unique(c((a_df$country=="china" & a_df$city), "lhasa")) - chn_pc$chn_new_city <- ifelse(is.na(chn_pc$chn_new_city), - chn_pc$chn_state, - chn_pc$chn_new_city) + +# pc is letters dash numbers ---------------------------------------------- + + pattern <- "\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" - chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_state)==TRUE), - NA,chn_pc$chn_state) + a_df$postal_code<- ifelse(grepl(pattern, a_df$city), + a_df$city,a_df$postal_code) + a_df$postal_code<- ifelse(grepl(pattern, a_df$state), + a_df$state,a_df$postal_code) - chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_city)==FALSE), - chn_pc$chn_city,chn_pc$chn_state) + # a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city==a_df$postal_code), + # a_df$postal_code,a_df$state) + a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$postal_code), + a_df$city,a_df$state) - chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_state)==TRUE), - NA,chn_pc$chn_state) + a_df$city<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$postal_code), + a_df$postal_code,a_df$city) - chn_pc$chn_state <- gsub(" province", "",chn_pc$chn_state) + a_df$city2<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$city), + NA,a_df$city2) - # Update `city_list` if necessary - city_list <- ifelse(is.na(city_list), chn_pc$chn_new_city, city_list) + # # + # # a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$state), + # # NA,a_df$state) + # # a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city==a_df$state), + # # NA,a_df$state) + # # + # a_df$city<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$city), + # a_df$city2,a_df$city) - # Update `state_list` if necessary - state_list<-ifelse(is.na(state_list),chn_pc$chn_state, state_list) - # Update `pc_list` if necessary - pc_list<-ifelse(is.na(pc_list),chn_pc$chn_pc, pc_list) + a_df$city<- ifelse(grepl(pattern, a_df$city), + gsub("[0-9]","",a_df$city), + a_df$city) + a_df$city<- gsub("[a-z]{1,2}- ","", a_df$city) - rm(chn_pc) -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# ### China has some where the postal code is with the city, so fix those here -# # Extract postal code information from the list -# chn_extract <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) -# } else if (x[[length(x)]] == "china") { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 2]]))) -# } else { -# return(c(NA, NA)) -# } -# } - -# Apply extraction to list_address -# chn_missing_pc <- data.frame(do.call(rbind, lapply(list_address, chn_extract))) -# names(chn_missing_pc) <- c("chn_pc1", "chn_pc2") -# -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", -# "dept", "div", "univ", "hosp", "coll", "sci", "rd","program" -# "minist", "educ", "sch ", "grad ", "fac ","assoc") -# -# -# -# # Extract numeric postal codes -# chn_missing_pc$chn_pc_from1 <- as.numeric(gsub("[A-Za-z]", "", chn_missing_pc$chn_pc1)) -# chn_missing_pc$chn_pc_from2 <- as.numeric(gsub("[A-Za-z]", "", chn_missing_pc$chn_pc2)) -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# clean_column <- function(column, delete_terms) { -# column <- gsub(paste(delete_terms, collapse = "|"), NA, column) -# column <- gsub("[0-9]", "", column) # Remove digits -# trimws(column) # Remove leading/trailing whitespace -# } -# -# # Clean chn_pc1 and chn_pc2 -# chn_missing_pc$chn_pc1 <- clean_column(chn_missing_pc$chn_pc1, to_delete) -# chn_missing_pc$chn_pc2 <- clean_column(chn_missing_pc$chn_pc2, to_delete) -# -# # Initialize empty columns for final outputs -# chn_missing_pc$chn_pc <- NA -# chn_missing_pc$chn_city <- NA -# chn_missing_pc$chn_state <- NA -# -# # Assign postal codes, cities, and states based on conditions -# assign_chn_data <- function(from1, from2, pc1, pc2) { -# list( -# chn_pc = ifelse(is.na(from1) & !is.na(from2), from2, from1), -# chn_city = ifelse(is.na(from1) & !is.na(from2), pc2, pc1), -# chn_state = ifelse(is.na(from1) & !is.na(from2), pc1, pc2) -# ) -# } -# -# chn_result <- assign_chn_data(chn_missing_pc$chn_pc_from1, -# chn_missing_pc$chn_pc_from2, -# chn_missing_pc$chn_pc1, -# chn_missing_pc$chn_pc2) -# -# chn_missing_pc$chn_pc <- chn_result$chn_pc -# chn_missing_pc$chn_city <- gsub("[0-9]", "", chn_result$chn_city) -# chn_missing_pc$chn_state <- gsub("[0-9]", "", chn_result$chn_state) - -# # Define Chinese states and cities -# chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", -# "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", -# "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", -# "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", -# "gansu", "inner mongolia", "jilin", "hainan", "ningxia", -# "qinghai", "tibet", "macao") -# -# # All the cities in the addresses, add as needed. -# chn_cities <- unique(c(chn_missing_pc$chn_city, "lhasa")) - -# Update states and cities based on matching conditions -# chn_missing_pc$chn_state <- ifelse(is.na(chn_missing_pc$chn_state) & chn_missing_pc$chn_pc1 %in% chn_states, -# chn_missing_pc$chn_pc1, chn_missing_pc$chn_state) -# chn_missing_pc$chn_state <- ifelse(is.na(chn_missing_pc$chn_state) & chn_missing_pc$chn_pc2 %in% chn_states, -# chn_missing_pc$chn_pc2, chn_missing_pc$chn_state) -# -# chn_missing_pc$chn_city <- ifelse(is.na(chn_missing_pc$chn_city) & !(chn_missing_pc$chn_pc1 %in% chn_states), -# chn_missing_pc$chn_pc1, chn_missing_pc$chn_city) -# chn_missing_pc$chn_city <- ifelse(is.na(chn_missing_pc$chn_city) & !(chn_missing_pc$chn_pc2 %in% chn_states), -# chn_missing_pc$chn_pc2, chn_missing_pc$chn_city) -# -# # put the postal codes and cities in the pc_list, state_list -# pc_list<-ifelse(is.na(pc_list),chn_missing_pc$chn_pc, pc_list) -# city_list<-ifelse(is.na(city_list),chn_missing_pc$chn_city, city_list) -# state_list<-ifelse(is.na(state_list),chn_missing_pc$chn_state, state_list) -# -# -# rm(chn_cities,chn_states,to_delete,chn_result,chn_missing_pc) - - - - - - - - - -# UK ------------------------------------------------------------------ - -process_uk_address <- function(x) { - if (length(x) == 1) { - return(c(NA, NA)) # Not uk - } - if ((x[[length(x)]] == "england")| - (x[[length(x)]] == "scotland")| - (x[[length(x)]] == "wales")| - (x[[length(x)]] == "northern ireland")) { - if (length(x) == 3) { - return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) - } else { - return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) - } - } + # a_df$postal_code<- gsub("[a-z]","", a_df$postal_code) - return(c(NA, NA)) # Not uk -} - -# Apply the function across all addresses using `mapply` -results <- t(mapply(process_uk_address, list_address)) -colnames(results) <- c("uk_city", "uk_state") - - - - -# Clean up results -results <- as.data.frame(results, stringsAsFactors = FALSE) -results$uk_city[results$uk_city == "not uk"] <- NA -results$uk_state[results$uk_state == "not uk"] <- NA - - - -results$uk_pc<-NA - -# Define the function -extract_uk_postcodes <- function(df, source_col, target_col,city_col) { - # Regular expression pattern for UK postal codes - # One or two initial letters. - # One or two digits (and possibly a letter). - # A mandatory space. - # One digit followed by two letters. - pattern <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" - pattern2 <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [A-Za-z]{2}" - pattern3 <- "[A-Za-z]{1,2}[0-9R]{3} [A-Za-z]{3}" - - - # Loop through each row of the dataframe - for(i in 1:nrow(df)) { - # Find all matches of the pattern in the source column - matches <- gregexpr(pattern, df[i, source_col]) - # Extract the matches - extracted_codes <- regmatches(df[i, source_col], matches) - # If there's at least one match and the target column is NA, copy the first match to the target column - if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { - df[i, target_col] <- extracted_codes[[1]][1] - df[i, city_col] <- df[i, source_col] - - } - } - return(df) -} - -# Example usage - -results <- extract_uk_postcodes(results, "uk_city","uk_pc", "uk_city") -results <- extract_uk_postcodes(results, "uk_state","uk_pc", "uk_city") - -# any without numbers gets NA'd -results$uk_pc[!grepl("\\d", results$uk_pc)] <- NA - - - -results$new_city<-NA - - -# Define the function -uk_city_id <- function(df, source_col, target_col) { - # Regular expression pattern for UK postal codes - # One or two initial letters. - # One or two digits (and possibly a letter). - # A mandatory space. - # One digit followed by two letters. - pattern <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" - - # Loop through each row of the dataframe - for(i in 1:nrow(df)) { - # Find all matches of the pattern in the source column - matches <- gregexpr(pattern, df[i, source_col]) - # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) - # Extract the matches - extracted_codes <- regmatches(df[i, source_col], matches) - # If there's at least one match and the target column is NA, copy the first match to the target column - if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { - df[i, target_col] <- df[i, source_col] - } - } - return(df) -} - -# Example usage - -results <- uk_city_id(results, "uk_city","new_city") -results <- uk_city_id(results, "uk_state","new_city") - - -results$new_city<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$new_city) - - - - -# Define the function -uk_city_id <- function(df, source_col, target_col) { - # Regular expression pattern for UK postal codes - # One or two initial letters. - # One or two digits (and possibly a letter). - # A mandatory space. - # One digit followed by two letters. - pattern <- "\\s[A-Za-z0-9]{2,4}\\s[A-Za-z0-9]{2,4}" - - # Loop through each row of the dataframe - for(i in 1:nrow(df)) { - # Find all matches of the pattern in the source column - matches <- gregexpr(pattern, df[i, source_col]) - # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) - # Extract the matches - extracted_codes <- regmatches(df[i, source_col], matches) - # If there's at least one match and the target column is NA, copy the first match to the target column - if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { - df[i, target_col] <- df[i, source_col] - } - } - return(df) -} - -# Example usage -results <- uk_city_id(results, "uk_state","new_city") - - - -results$uk_state<-ifelse(results$uk_state==results$uk_city, - "",results$uk_state) - -results$new_city<-ifelse(is.na(results$new_city), - results$uk_city, - results$new_city) -# remove zip codes from new city -results$new_city<-gsub("\\s[A-Za-z0-9]{3}\\s[A-Za-z0-9]{3}","",results$new_city) -results$new_city<-gsub("\\s[A-Za-z0-9]{4}\\s[A-Za-z0-9]{2,3}","",results$new_city) - - -results$new_city<-ifelse(results$uk_state=="london", - "london", - results$new_city) - - -results$uk_state<-ifelse(results$uk_state=="london", - NA, - results$uk_state) - - - - - - - - - - -# keep portions with numbers / remove city names -# results$uk_city<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$uk_city) -# results$uk_state<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$uk_state) -# results$uk_pc<-sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", results$uk_pc) - -# results$can_pc<-gsub("\\b[^\\d\\s]+\\b", "", results$can_pc) #+[a-z] removes 2nd ltr - -results$uk_pc[results$uk_pc == ""] <- NA -# now remove any PC from city - -# Update `city_list` if necessary -city_list <- ifelse(is.na(city_list), results$new_city, city_list) - -# Update `state_list` if necessary -state_list<-ifelse(is.na(state_list),results$uk_state, state_list) - -# Update `pc_list` if necessary -pc_list<-ifelse(is.na(pc_list),results$uk_pc, pc_list) - -rm(results) - - - - - - - - - - - - - - - - - - - - -# Extracts postal code when combined with city name ----------------------- - -city_list <- trimws(city_list, which = "both") - - - - city_clean<-data.frame( - authorID=ID, - addresses=addresses, - original_city=city_list, - city_list=city_list, - state_list=state_list, - country_list=country_list, - extract_pc=pc_list) - - - # # England, Scotland, Wales --------------------------------------------- - # - # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc)& (city_clean$country_list=="england"| - # city_clean$country_list=="wales" | - # city_clean$country_list=="scotland"), - # gsub(".*\\s([a-z0-9]+\\s[a-z0-9]+)$", "\\1", city_clean$city_list), - # city_clean$extract_pc) - # - # # This then deletes the postal code from the city name - # city_clean$city_list<-ifelse((city_clean$country_list=="england"| - # city_clean$country_list=="wales" | - # city_clean$country_list=="scotland"), - # (gsub("([A-Za-z0-9]+\\s[A-Za-z0-9]+)$", "", city_clean$city_list)), - # city_clean$city_list) - # - city_clean[city_clean == ""] <- NA + a_df$city<- gsub("[-]","", a_df$city) + a_df[] <- lapply(a_df, trimws) - # Define the function - extract_postcodes <- function(df, source_col, target_col) { - # One or two initial letters. - # mandatory dash - # several numbers - pattern <- "\\b[A-Za-z]{1,2}[-][0-9]{4,8}\\b" - # Loop through each row of the dataframe - for(i in 1:nrow(df)) { - # Find all matches of the pattern in the source column - matches <- gregexpr(pattern, df[i, source_col]) - # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) - # Extract the matches - extracted_codes <- regmatches(df[i, source_col], matches) - # If there's at least one match and the target column is NA, copy the first match to the target column - if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { - df[i, target_col] <- extracted_codes[[1]][1] - } - } - return(df) - } - # Example usage + pattern <- "\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" + + a_df$postal_code<- ifelse(grepl(pattern, a_df$postal_code), + gsub("[a-z]","",a_df$postal_code), + a_df$postal_code) + + a_df$postal_code<- gsub("[-]","", a_df$postal_code) + a_df[] <- lapply(a_df, trimws) - city_clean <- extract_postcodes(city_clean, "city_list","extract_pc") +# final postal codes - consecutinve numbers ------------------------------- # Define the function - extract_postcodes <- function(df, source_col, target_col) { - # One or two initial letters. - # mandatory dash - # several numbers - pattern <- " [0-9]{3,9}" - # Loop through each row of the dataframe - for(i in 1:nrow(df)) { - # Find all matches of the pattern in the source column - matches <- gregexpr(pattern, df[i, source_col]) - # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) - # Extract the matches - extracted_codes <- regmatches(df[i, source_col], matches) - # If there's at least one match and the target column is NA, copy the first match to the target column - if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { - df[i, target_col] <- extracted_codes[[1]][1] - } - } + extract_consecutive_numbers <- function(df, source, destination) { + df[[destination]] <- sapply(1:nrow(df), function(i) { + # Use gregexpr to find sequences of 4 or more consecutive numbers + if (is.na(df[[destination]][i])) { + matches <- gregexpr("\\d{4,}", df[[source]][i]) + # Extract the matched sequences + result <- regmatches(df[[source]][i], matches) + # Flatten the list of matches into a character vector + result <- unlist(result) + # Combine the matched sequences into a single string + result <- paste(result, collapse = " ") + return(result) + } else { + return(df[[destination]][i]) + } + }) return(df) } + a_df <- extract_consecutive_numbers(a_df, "state","postal_code") - # Example usage - city_clean <- extract_postcodes(city_clean, "city_list","extract_pc") + +# clean the city ---------------------------------------------------------- + + a_df$city<- gsub("[0-9]","", a_df$city) - # Define the function - delete_matching_text <- function(df, col_a, col_b) { - # Loop through each row of the dataframe - for(i in 1:nrow(df)) { - # Check if the value in column A is not NA and is found within the text in column B - if(!is.na(df[i, col_a]) && grepl(df[i, col_a], df[i, col_b])) { - # Remove the matching text from column B by replacing it with an empty string - df[i, col_b] <- gsub(df[i, col_a], "", df[i, col_b]) - } - } - return(df) - } - city_clean <- delete_matching_text(city_clean, "extract_pc","city_list") + +# clean up postal code ---------------------------------------------------- + - city_clean <- delete_matching_text(city_clean, "extract_pc","state_list") + a_df$postal_code<- ifelse(grepl("\\b[a-zA-Z]+\\s+[0-9]+\\b", a_df$postal_code), + gsub("\\b[a-zA-Z]+\\s","", a_df$postal_code), + a_df$postal_code) - # remove state if same as city - city_clean$state_list<-ifelse(city_clean$state_list==city_clean$city_list,NA, city_clean$state_list) - # there are some usa ones that had state zip buyt no country - # Define the function - extract_postcodes <- function(df, country_list, extract_pc, state_list) { - # One or two initial letters. - # mandatory dash - # several numbers - pattern <- "\\b[A-Za-z]{2} [0-9]{5}\\b" - # Loop through each row of the dataframe - for(i in 1:nrow(df)) { - # Find all matches of the pattern in the source column - matches <- gregexpr(pattern, df[i, country_list]) - # Extract the matches - extracted_codes <- regmatches(df[i, country_list], matches) - # If there's at least one match and the target column is NA, copy the first match to the target column - if(length(extracted_codes[[1]]) > 0 && is.na(df[i, extract_pc])) { - df[i, extract_pc] <- extracted_codes[[1]][1] - df[i, state_list] <- extracted_codes[[1]][1] - df[i, country_list] <- "usa" - } - } - return(df) - } - # Example usage - city_clean <- extract_postcodes(city_clean, - "country_list", "extract_pc", "state_list") - # remove zip codes from states - city_clean$state_list<-ifelse(city_clean$country_list=="usa", - gsub("[0-9]","",city_clean$state_list), - city_clean$state_list) - # remove state from zipcode - city_clean$extract_pc<-ifelse(city_clean$country_list=="usa", - gsub("[a-z]","",city_clean$extract_pc), - city_clean$extract_pc) + a_df$city<-ifelse(a_df$city=="university pk", + "university park", + a_df$city) + a_df$city<-ifelse(a_df$city=="gavea rio de janeiro", + "rio de janeiro", + a_df$city) + a_df$city<-ifelse(a_df$city=="college stn", + "college station", + a_df$city) + a_df$city<-ifelse(a_df$city=="n chicago", + "north chicago", + a_df$city) + a_df$city<-ifelse(a_df$city=="college pk", + "college park", + a_df$city) + a_df$city<-ifelse(a_df$city=="research triangle pk" | a_df$city=="res triangle pk", + "research triangle park", + a_df$city) + a_df$city<-ifelse(a_df$city=="state coll", + "state college", + a_df$city) + # city corrections + a_df$city<-ifelse((a_df$city=="dehra dun" & a_df$country == "india"), + "dehradun", + a_df$city) + a_df$city<-ifelse((a_df$city=="st john" & a_df$country == "canada"), + "st. john's", + a_df$city) + a_df$state<-ifelse(a_df$state=="london ", + "london", + a_df$state) + a_df$city<-ifelse((a_df$state=="london" & a_df$country == "england"), + "london", + a_df$city) + a_df$city<-ifelse((a_df$country=="brazil" & a_df$city == "s jose campos"), + "sao jose dos campos", + a_df$city) + a_df$city<-ifelse((a_df$country=="brazil" & (a_df$city == "rio de janerio"| + a_df$city == "rio de janiero"| + a_df$city == "rio der janeiro"| + a_df$city == "rio janiero")), + "rio de janeiro", + a_df$city) + # + # a_df$state<-ifelse(a_df$state=="london", + # NA, + # a_df$state) + a_df$city<-ifelse(a_df$country=="mexico" & a_df$city == "df", + "mexico city", + a_df$city) + a_df$city<-ifelse(a_df$country=="argentina" & a_df$city == "df", + "buenos aires",a_df$city) + +} + # usa cz = canal zone + # scotland mrc + # usa ada, apo, dpo + # st for saint (but cant change forest or west) + # netheerlands first two letters of city names xx name + # ste for saint + # + # country<- a_df %>% + # # mutate(city_match=(city==second_tier)) %>% + # # filter(city_match==FALSE) %>% + # distinct(country) %>% + # mutate(summary=nchar(country)) %>% + # arrange(country) + # + # + # country_city<- a_df %>% + # # mutate(city_match=(city==second_tier)) %>% + # # filter(city_match==FALSE) %>% + # distinct(country,city) %>% + # mutate(city_char=nchar(city)) %>% + # arrange(country,city) + # + # + # + # country_state<- a_df %>% + # # mutate(city_match=(city==second_tier)) %>% + # # filter(city_match==FALSE) %>% + # distinct(country,state) %>% + # mutate(city_char=nchar(state)) %>% + # arrange(country,state) + # + # country_state_city<- a_df %>% + # # mutate(city_match=(city==second_tier)) %>% + # # filter(city_match==FALSE) %>% + # distinct(country ,state,city) %>% + # mutate(city_char=nchar(city)) %>% + # arrange(country,state,city) + # + # + # country_state_city_pc<- a_df %>% + # # mutate(city_match=(city==second_tier)) %>% + # # filter(city_match==FALSE) %>% + # distinct(country ,state,postal_code,city) %>% + # mutate(city_char=nchar(city)) %>% + # arrange(country,state,postal_code,city) - city_clean$extract_pc <- trimws(gsub("&", "", city_clean$extract_pc)) - city_clean[city_clean == ""] <- NA + +# old code ---------------------------------------------------------------- + # # # # +# last_element %in% c("india", "china", +# # -head(city_clean) -# -# # Cleaning up city names with zip codes in them -# # take the zip code and put it in a new column before deleting +# a_df$state<- ifelse(a_df$state==a_df$city2,NA,a_df$state) # -# # -# -# # 2) Countries with postal code AFTER city -------------------------------- -# # -# city_zip<-c("ireland","qatar","kazakhstan","peru","turkey","south korea", -# "japan","costa rica","mexico","new zealand","iran","thailand", -# "russia","spain","india","singapore","indonesia","chile", -# "finland","colombia","taiwan","saudi arabia","uruguay", -# "slovenia","spain") +# ## Australia postal codes also separated +# # sometimes in the next-to-last, sometimes in the next-to-next-to-last (with state) # +# # First need to "fix" the city +# # Function to check for three characters or numbers in the city-list and replace with NA +# process_aus_address <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) # Not Australia +# } +# +# if (x[[length(x)]] == "australia") { +# if (length(x) == 3) { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) +# } else { +# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) +# } +# } # +# return(c(NA, NA)) # Not Australia +# } # -# city_clean$extract_pc<-ifelse((is.na(city_clean$extract_pc)& (city_clean$country_list %in% city_zip)), -# (gsub(".*[A-Za-z]+", "", city_clean$city_list)), -# city_clean$extract_pc) -# # -# city_clean$city_list<-ifelse((city_clean$country_list %in% city_zip), -# gsub("\\s([0-9]+)", "", city_clean$city_list), -# city_clean$city_list) +# # Apply the function across all addresses using `mapply` +# results <- t(mapply(process_aus_address, list_address)) +# colnames(results) <- c("aus_city", "aus_state") # -# city_clean[city_clean == ""] <- NA +# # Clean up results +# results <- as.data.frame(results, stringsAsFactors = FALSE) +# results$aus_city[results$aus_city == "not australia"] <- NA +# results$aus_state[results$aus_state == "not australia"] <- NA # -# -# # 3) Postal code and dash BEFORE city name -------------------------------- -# -# -# zip_dash<-c("finland","slovakia","austria","portugal","belgium", -# "spain","israel","czech republic","argentina","france", -# "sweden","switzerland","turkey","germany","italy", -# "lithuania","hungary","denmark","poland","norway", "iceland", -# "greece", "ukraine","estonia","latvia","luxembourg","lativa", -# "south africa","bulgaria","brazil") -# -# +# # take the PC+state and assign to PC +# results$aus_pc<-results$aus_state +# # remove all digits from state +# results$aus_state <- trimws(gsub("\\d", "", results$aus_state)) +# # remove all letters from pc +# results$aus_pc <- trimws(gsub("[a-z]", "", results$aus_pc)) +# results$aus_pc[results$aus_pc == ""] <- NA +# +# # if na in PC, assign the city (some of which have PC) +# results$aus_pc <- ifelse(is.na(results$aus_pc), results$aus_city, results$aus_pc) +# # remove all metters from pc, leaving any new pc +# results$aus_pc <- trimws(gsub("[a-z]", "", results$aus_pc)) +# results$aus_pc[results$aus_pc == ""] <- NA +# # now remove any PC from city +# results$aus_city <- trimws(gsub("\\d", "", results$aus_city)) #alternative +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$aus_city, city_list) +# +# # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$aus_state, state_list) +# +# # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$aus_pc, pc_list) +# +# rm(results) +# +# +# +# # CANADA ------------------------------------------------------------------ +# +# ## Canada postal codes also separated +# # sometimes in the next-to-last, sometimes in the next-to-next-to-last (with state) +# +# process_can_address <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) # Not Canada +# } # +# if (x[[length(x)]] == "canada") { +# if (length(x) == 3) { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) +# } else { +# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) +# } +# } # -# city_clean$extract_pc <- ifelse((is.na(city_clean$extract_pc) & -# (city_clean$country_list %in% zip_dash)), -# # gsub("\\s([A-Za-z]+)", "", city_clean$city_list), -# sub(" .*", "", city_clean$city_list), -# city_clean$extract_pc) +# return(c(NA, NA)) # Not canada +# } +# +# # Apply the function across all addresses using `mapply` +# results <- t(mapply(process_can_address, list_address)) +# colnames(results) <- c("can_city", "can_state") +# +# # Clean up results +# results <- as.data.frame(results, stringsAsFactors = FALSE) +# results$can_city[results$can_city == "not canada"] <- NA +# results$can_state[results$can_state == "not canada"] <- NA +# +# # take the PC+state and assign to PC +# results$can_pc<-results$can_state +# +# # any without numbers gets NA'd +# results$can_pc[!grepl("\\d", results$can_pc)] <- NA +# # removes state and removes ltr at start of PC +# results$can_pc<-sub("[a-z]\\s+[a-z]", "", results$can_pc) #+[a-z] +# +# +# # if na in PC, assign the city (some of which have PC) +# results$can_pc <- ifelse(is.na(results$can_pc), results$can_city, results$can_pc) +# results$can_pc[!grepl("\\d", results$can_pc)] <- NA +# +# # keep portions with numbers / remove city names +# # .*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$: This regex pattern matches any +# # characters (.*) followed by a word boundary (\\b) and exactly three word +# # characters (\\w{3}), capturing this as the first group ((\\w{3})). It +# # then matches any characters again (.*) followed by another word boundary +# # and exactly three word characters, capturing this as the second +# # group ((\\w{3})), and ensures this is at the end of the string ($). +# # sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", string): Replaces the +# # entire string with the two captured groups separated by a space. +# results$can_pc<-sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", results$can_pc) +# +# # results$can_pc<-gsub("\\b[^\\d\\s]+\\b", "", results$can_pc) #+[a-z] removes 2nd ltr +# +# results$can_pc[results$can_pc == ""] <- NA +# # now remove any PC from city +# results$can_city<-sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", results$can_city) # all after first space +# +# +# # fix state +# results$can_state <- trimws(gsub("\\d", "", results$can_state)) +# results$can_state <-trimws(sub(" .*", "", results$can_state)) # all after first space +# results$can_state <- gsub("british", "bc", results$can_state) +# results$can_state <- gsub("nova", "ns", results$can_state) +# # fix city +# results$can_city <- trimws(gsub("\\d", "", results$can_city)) +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$can_city, city_list) +# +# # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$can_state, state_list) +# +# # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$can_pc, pc_list) +# +# rm(results) +# +# # INDIA ------------------------------------------------------------------ +# +# # India states are almost always listed but New Delhi is complicated, +# # as are any with only three entries +# # Function to process addresses for both city and state +# process_india_address <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) # Not India +# } # +# if (x[[length(x)]] == "india") { +# if (length(x) == 3) { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) +# } else { +# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) +# } +# } # -# city_clean$city_list<-ifelse((city_clean$country_list %in% zip_dash), -# gsub("[a-zA-Z]+-[0-9]+", "", city_clean$city_list), -# city_clean$city_list) +# if (length(x) != 3 && grepl("delhi", x[[length(x) - 1]])) { +# return(c(trimws(x[[length(x) - 1]]), NA)) +# } # +# return(c(NA, NA)) # Not India +# } # -# city_clean[city_clean == ""] <- NA -# -# # 4) Netherlands Postal Code ---------------------------------------------- -# -# # Netherlands has Postal Code before -# # it is a combination of 2-3 blocks of letters and numbers -# city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc) & (city_clean$country_list=="netherlands"), -# (gsub("^(([^ ]+ ){2}).*", "\\1", city_clean$city_list)), -# city_clean$extract_pc) -# city_clean$city_list<-ifelse((city_clean$country_list=="netherlands"), -# (gsub(".*\\s", "", city_clean$city_list)), -# city_clean$city_list) -# city_clean[city_clean == ""] <- NA -# -# # 5) Venezuela ----------------------------------------------------------- -# -# # Venezuela has postal code after, it is combo of letters and numbers -# city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc) & (city_clean$country_list=="venezuela"), -# gsub(".*\\s([a-z0-9]+)$", "\\1", city_clean$city_list), -# city_clean$extract_pc) +# # Apply the function across all addresses using `mapply` +# results <- t(mapply(process_india_address, list_address)) +# colnames(results) <- c("india_city", "india_state") # -# # This then deletes the postal code from the city name -# city_clean$city_list<-ifelse(city_clean$country_list=="venezuela", -# (gsub("(\\s[A-Za-z0-9]+)$", "", city_clean$city_list)), -# city_clean$city_list) -# city_clean[city_clean == ""] <- NA +# # Clean up results +# results <- as.data.frame(results, stringsAsFactors = FALSE) +# results$india_city[results$india_city == "not india"] <- NA +# results$india_state[results$india_state == "not india"] <- NA # +# # Remove numeric parts from state names and trim whitespace +# results$india_state <- trimws(gsub("\\s([0-9]+)", "", results$india_state)) # +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$india_city, city_list) +# #### # -# # trim ws -# city_clean$extract_pc <- trimws(city_clean$extract_pc, which = "both") -# city_clean$city_list <- trimws(city_clean$city_list, which = "both") -# # This removes any that don't have numbers in them -# city_clean$extract_pc[!grepl("[0-9]", city_clean$extract_pc)] <- NA -# city_clean$city_list <- gsub("[0-9]+", "", city_clean$city_list) +# state_list<-ifelse(is.na(state_list),results$india_state, state_list) # +# rm(results) # -# Final Clean Up ---------------------------------------------------------- - - # Russia - city_clean$city_list<-ifelse(city_clean$country_list=="russia", - (gsub("st Petersburg p", "st petersburg", city_clean$city_list)), - city_clean$city_list) - - - - # India - India_delete<- c("dept") - - city_clean$city_list <- ifelse(city_clean$country_list=="india", - gsub(paste(India_delete, collapse = "|"), NA, city_clean$city_list), - city_clean$city_list) - city_clean$city_list<-trimws(city_clean$city_list) # Remove leading/trailing whitespace - - # brazil - city_clean$extract_pc<-ifelse((city_clean$country_list=="brazil" & nchar(city_clean$extract_pc) < 5), - "",city_clean$extract_pc) - city_clean[city_clean == ""] <- NA - - brazil_delete<- c("barretos canc hosp","univ fed sao paulo", - "escola filosofia letras & ciencias humanas", - "hosp sirio libanes","perola byington hosp", - "sp","univ fed sao paulo","nacl","pesquisa", "museu","dept", - "lab","zoologia", "inst", "programa","ppg", "ppg") - - - city_clean$city_list <- ifelse(city_clean$country_list=="brazil", - gsub(paste(brazil_delete, collapse = "|"), NA, city_clean$city_list), - city_clean$city_list) - - city_clean$city_list<-trimws(city_clean$city_list, which="both") # Remove leading/trailing whitespace - city_clean$state_list<-trimws(city_clean$state_list, which="both") # Remove leading/trailing whitespace - city_clean$country_list<-trimws(city_clean$country_list, which="both") # Remove leading/trailing whitespace) # Remove leading/trailing whitespace - # City Abbreviations - - city_clean$city_list<-ifelse(city_clean$city_list=="university pk", - "university park", - city_clean$city_list) - - - city_clean$city_list<-ifelse(city_clean$city_list=="n chicago", - "north chicago", - city_clean$city_list) - - city_clean$city_list<-ifelse(city_clean$city_list=="college pk", - "college park", - city_clean$city_list) - - city_clean$city_list<-ifelse(city_clean$city_list=="research triangle pk", - "research triangle park", - city_clean$city_list) - - city_clean$city_list<-ifelse(city_clean$city_list=="state coll", - "state college", - city_clean$city_list) - - # city corrections - city_clean$city_list<-ifelse((city_clean$city_list=="dehra dun" & city_clean$country_list == "india"), - "dehradun", - city_clean$city_list) - - city_clean$city_list<-ifelse((city_clean$city_list=="st john" & city_clean$country_list == "canada"), - "st. john's", - city_clean$city_list) - - city_clean$state_list<-ifelse(city_clean$state_list=="london ", - "london", - city_clean$state_list) - - city_clean$city_list<-ifelse((city_clean$state_list=="london" & city_clean$country_list == "england"), - "london", - city_clean$city_list) - - - city_clean$state_list<-ifelse(city_clean$state_list=="london", - NA, - city_clean$state_list) - - - - city_list<-city_clean$city_list - state_list<-city_clean$state_list - pc_list<-city_clean$extract_pc - country_list<-city_clean$country_list - - - rm(brazil_delete,India_delete) - - - # rm(city_clean) - - - pc_list[pc_list == ""] <- NA - city_list[city_list == ""] <- NA - state_list[state_list == ""] <- NA - dept_list[dept_list == ""] <- NA - country_list[country_list == ""] <- NA - # Create the df that will be returned - cleaned_ad<-data.frame(ID, - addresses, - university_list, - dept_list, - city_list, - country_list, - state_list, - pc_list) - - - - # names(cleaned_ad) - - - list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))]) - - # Because formats of address printing is different across platforms - # We are going to split using a tier system assuming first and last - # info is somewhat reliable and guess the other info from the - # remaining position of the info - - second_tier_list <- lapply(list_address1, function(x) x[length(x)]) - second_tier_list <- trimws(second_tier_list, which = "both") - second_tier_list[second_tier_list == "character(0)"] <- NA - - list_address2 <- lapply(list_address1, function(x) x[-c(length(x))]) - - third_tier_list <- lapply(list_address2, function(x) x[length(x)]) - third_tier_list <- trimws(third_tier_list, which = "both") - third_tier_list[third_tier_list == "character(0)"] <- NA - - # All remaining info is just shoved in this category - remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1]) - remain_list <- trimws(remain_list, which = "both") - remain_list[remain_list == "character(0)"] <- NA - - - # original - # a_df <- data.frame( - # adID = ID, university = university_list, - # country = country_list, - # state = state_list, postal_code = pc_list, city = NA, - # department = NA, second_tier = second_tier_list, - # third_tier = third_tier_list, - # remain = remain_list, address = addresses, - # stringsAsFactors = FALSE - # ) - - # EB EDIT - a_df_1 <- data.frame( - adID = ID, - university_1 = university_list, - university = university_list, - country_1 = country_list, - country = country_list, - state_1 = state_list, - state = state_list, - postal_code_1 = pc_list, - postal_code = pc_list, - city_1 = city_list, - city = city_list, - department_1 = dept_list, - department = dept_list, - second_tier = second_tier_list, - third_tier = third_tier_list, - remain = remain_list, - address = addresses, - stringsAsFactors = FALSE - ) - - a_df_1$city_list<-ifelse(is.na(a_df_1$city_1), - a_df_1$city, - a_df_1$city_1) - - a_df_1$postal_code_1<-ifelse(is.na(a_df_1$postal_code_1), - a_df_1$postal_code, - a_df_1$postal_code_1) - - - # Quebec Postal Code is PQ (Fr) or QC (En) but only QC is georeferenced - a_df_1$state_1<-ifelse(a_df_1$state_1=="pq" & a_df_1$country_1 == "canada", - "qc", - a_df_1$state_1) - - - - - - - - - - - - - - - - - - -a_df<-a_df_1 - -rm(a_df_1) - - - - # try to fix the usa spots, which vary in format than other countries - a_df$state<-ifelse(is.na(a_df$state),"",a_df$state) # added by eb to deal with index problem - a_df$city[nchar(a_df$state) > 0] <- a_df$second_tier[nchar(a_df$state) > 0] - a_df$state[nchar(a_df$state) == 0] <- NA - a_df$postal_code[nchar(a_df$postal_code) == 0] <- NA - a_df$department[!is.na(a_df$state) & !is.na(a_df$postal_code) & - !is.na(a_df$state)] <- a_df$third_tier[!is.na(a_df$state) & - !is.na(a_df$postal_code) & !is.na(a_df$state)] - # fix a US problem when usa is not tacked onto the end - - us_reg <- "[[:alpha:]]{2}[[:space:]]{1}[[:digit:]]{5}" - a_df$state[ grepl(us_reg, a_df$country) ] <- - substr(a_df$country[ grepl(us_reg, a_df$country) ], 1, 2) - - a_df$postal_code[ grepl(us_reg, a_df$country) ] <- - substr(a_df$country[grepl(us_reg, a_df$country)], 4, 8) - - a_df$country[grepl(us_reg, a_df$country)] <- "usa" - - - a_df$state_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", - a_df$state, - a_df$state_1) - - a_df$postal_code_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", - a_df$postal_code, - a_df$postal_code_1) - - - a_df$country_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", - a_df$country, - a_df$country_1) - - - ########################## - # We'll use regular expression to pull zipcodes - # These formats differ by region - int1 <- "[[:alpha:]]{2}[[:punct:]]{1}[[:digit:]]{1,8}" - int2 <- paste("[[:space:]][[:upper:]][[:digit:]][[:upper:]]", - "[[:space:]][[:digit:]][[:upper:]][[:digit:]]", sep="") - int3 <- "[[:alpha:]][[:punct:]][[:digit:]]{4,7}" - int4 <- "[:upper:]{1,2}[:alnum:]{1,3}[:space:][:digit:][:alnum:]{1,3}" - int <- paste(int1, int2, int3, int4, sep = "|") - - uk <- paste("[[:upper:]]{1,2}[[:digit:]]{1,2}[[:space:]]", - "{1}[[:digit:]]{1}[[:upper:]]{2}", sep="") - - mexico <- "[[:space:]]{1}[[:digit:]]{5}" # technically US as well - - panama <- "[[:digit:]]{4}-[[:digit:]]{5}" - - zip_search <- paste0(int, "|", uk, "|", mexico, "|", panama) - - - - - - # ADDRD EB INSTEAD OF - a_df$city_1 <- ifelse(is.na(a_df$city_1),a_df$third_tier,a_df$city_1) - a_df$state_1 <- ifelse(is.na(a_df$state_1),a_df$second_tier,a_df$state_1) - a_df$postal_code_1 <- ifelse(is.na(a_df$postal_code_1),a_df$second_tier,a_df$postal_code_1) - - - # fix country - usa - # Function to remove everything before " usa" - remove_before_usa <- function(x) { - if (grepl(" usa", x)) { - return(sub(".*(?= usa)", "", x, perl = TRUE)) - } else { - return(x) - } - } - - # Apply the function to each element in the vector - a_df$country_1 <- sapply(a_df$country_1, remove_before_usa) - a_df$country_1 <- trimws(a_df$country_1, which = "both") - - - a_df$state_1 <- ifelse(a_df$country_1=="usa", - (trimws(gsub("[0-9]", "", a_df$state_1), which="both")), - a_df$state_1) - - a_df$postal_code_1 <- ifelse(a_df$country_1=="usa", - (trimws(gsub("[a-z]", "", a_df$postal_code_1), which="both")), - a_df$postal_code_1) - - - - ########################### - id_run <- a_df$adID[is.na(a_df$state) & is.na(a_df$postal_code) & - a_df$address != "Could not be extracted"] - ########################### - - # We now iteratively run through the addresses using the concept that - # certain information always exists next to each other. - # Ex. city, state, country tend to exist next to each other. - # We use the position of the zipcode also to help guide us - # in where the information lies as well as how many fields were - # given to us. - for (i in id_run) { - found <- FALSE - row <- which(a_df$adID == i) - university <- a_df$university[row] - second_tier <- a_df$second_tier[row] - third_tier <- a_df$third_tier[row] - remain <- a_df$remain[row] - city <- a_df$city[row] - state <- a_df$state[row] - postal_code <- a_df$postal_code[row] - department <- a_df$department[row] - grepl(zip_search, second_tier) - grepl(zip_search, third_tier) - # 2nd tier - if (grepl(zip_search, second_tier)) { - found <- TRUE - postal_code <- regmatches(second_tier, regexpr(zip_search, second_tier)) - city <- gsub(zip_search, "", second_tier) - department <- ifelse(is.na(remain), third_tier, remain) - } - # 3RD tiers - if (grepl(zip_search, third_tier) & !found) { - found <- TRUE - postal_code <- regmatches(third_tier, regexpr(zip_search, third_tier)) - city <- gsub(zip_search, "", third_tier) - state <- second_tier - department <- remain - } - - if (!found) { - state <- second_tier - city <- third_tier - department <- remain - } - # To make university searching more efficient we'll override values - # based on if it has university/college in the name, - # where university overides college - override_univ <- grepl("\\buniv\\b|\\buniversi", - c(second_tier, third_tier, remain, city, university), - ignore.case = TRUE) & - !grepl("\\bdrv\\b|\\bdrive\\b", - c(second_tier, third_tier, remain, city, university), - ignore.case = TRUE) - - if (any(override_univ)) { - university <- - c(second_tier, third_tier, remain, city, university)[override_univ][1] - assign( - c("second_tier", "third_tier", "remain", "city", "university")[ - override_univ][1], - NA - ) - } - # only if university doesnt already exist - override_univ_col <- - grepl("\\bcol\\b|college|\\bcoll\\b", - c(second_tier, third_tier, remain, city, university), - ignore.case = TRUE) & - !grepl("\\bdrv\\b|\\bdrive\\b", - c(second_tier, third_tier, remain, city, university), - ignore.case = TRUE) - - if (!any(override_univ) & any(override_univ_col)) { - university <- - c(second_tier, third_tier, remain, city, university )[ - override_univ_col][1] - - assign( - c("second_tier", "third_tier", "remain", "city", "university")[ - override_univ_col][1], - NA - ) - } - # more risky, but institutions as well, just incase its not a university - override_univ_inst <- grepl("\\binst\\b|\\binstitut", - c(second_tier, third_tier, remain, city, university), - ignore.case = TRUE) - if ( - !any(override_univ) & !any(override_univ_col) & any(override_univ_inst) - ) { - department <- c(second_tier, third_tier, remain, city, university )[ - override_univ_inst][1] - - assign( - c("second_tier", "third_tier", "remain", "city", "university")[ - override_univ_inst][1], - NA - ) - } - - a_df$city[row] <- gsub("[[:digit:]]", "", city) - a_df$state[row] <- gsub("[[:digit:]]", "", state) - a_df$postal_code[row] <- postal_code - a_df$department[row] <- department - - - - #########################Clock############################### - total <- length(id_run) - pb <- utils::txtProgressBar(min = 0, max = total, style = 3) - utils::setTxtProgressBar(pb, which(id_run == i)) - ############################################################# - } - - - city_fix <- is.na(a_df$city) & !is.na(a_df$state) - a_df$city[city_fix] <- a_df$state[city_fix] - a_df$state[city_fix] <- NA - a_df$university[a_df$university == "Could not be extracted"] <- NA - a_df$country[a_df$country == "Could not be extracted"] <- NA - # a_df$country[a_df$country == "peoples r china"] <- "China" - # a_df$country[a_df$country == "U Arab Emirates"] <- "United Arab Emirates" - # a_df$country[a_df$country == "Mongol Peo Rep"] <- "Mongolia" - - a_df$postal_code[grepl("[[:alpha:]]{1,2}-", a_df$postal_code)] <- - vapply(strsplit( - a_df$postal_code[ grepl("[[:alpha:]]{1,2}-", a_df$postal_code)], - "-"), - function(x) x[2], character(1) - ) - #strip periods from the ends of city,state,country - a_df$city <- gsub("\\.", "", a_df$city) - a_df$state <- gsub("\\.", "", a_df$state) - a_df$country <- gsub("\\.", "", a_df$country) - a_df$country[a_df$country == ""] <- NA - a_df$university[a_df$university == ""] <- NA - a_df$postal_code[a_df$postal_code == ""] <- NA - #convert to lower - for (l in 2:ncol(a_df)){ - a_df[, l] <- tolower(a_df[, l]) - } - - # Select columns - a_df <- a_df[, c("adID", - "university_1", - "country_1", - "state_1", - "postal_code_1", - "city_1", - "department_1", - "second_tier", - "third_tier", - "remain", - "address") - ] - - # Rename columns - colnames(a_df) <- c("adID", - "university", - "country", - "state", - "postal_code", - "city", - "department", - "second_tier", - "third_tier", - "remain", - "address") - - - # sometimes the postal code fails to prse out of state. canm use this - # when postal code is missing, but then need to remove - # Function to extract numbers from one column and copy them to another column - extract_numbers <- function(df, source_col, target_col) { - if (is.na(target_col)) { - - - df[[target_col]] <- gsub(".*?(\\d+).*", "\\1", df[[source_col]]) - df[[target_col]][!grepl("\\d", df[[source_col]])] <- NA - return(df) - - } else { - return(df) - } - - } - - # Apply the function to the dataframe - a_df <- extract_numbers(a_df, "state", "postal_code") - - - - # ther postal code and city are sometimes in tier3 - a_df$city <- ifelse(is.na(a_df$city), a_df$third_tier, a_df$city) - a_df$postal_code <- ifelse(is.na(a_df$postal_code), a_df$third_tier, a_df$postal_code) - - - - - - # Function to remove matching characters from col1 based on col2 - remove_matching <- function(col1, col2) { - pattern <- paste0("\\b", gsub("([\\W])", "\\\\\\1", col2), "\\b") - result <- sub(pattern, "", col1) - trimws(result) - } - - # Apply the function to each row - a_df$city <- mapply(remove_matching, a_df$city, a_df$postal_code) - a_df$state <- mapply(remove_matching, a_df$state, a_df$postal_code) - - - - library(tidyverse) - - - country<- a_df %>% - # mutate(city_match=(city==second_tier)) %>% - # filter(city_match==FALSE) %>% - distinct(country) %>% - mutate(summary=nchar(country)) %>% - arrange(country) - - - country_city<- a_df %>% - # mutate(city_match=(city==second_tier)) %>% - # filter(city_match==FALSE) %>% - distinct(country,city) %>% - mutate(city_char=nchar(city)) %>% - arrange(country,city) - - - - country_state<- a_df %>% - # mutate(city_match=(city==second_tier)) %>% - # filter(city_match==FALSE) %>% - distinct(country,state) %>% - mutate(city_char=nchar(state)) %>% - arrange(country,state) - - country_state_city<- a_df %>% - # mutate(city_match=(city==second_tier)) %>% - # filter(city_match==FALSE) %>% - distinct(country ,state,city) %>% - mutate(city_char=nchar(city)) %>% - arrange(country,state,city) - - - country_state_city_pc<- a_df %>% - # mutate(city_match=(city==second_tier)) %>% - # filter(city_match==FALSE) %>% - distinct(country ,state,postal_code,city) %>% - mutate(city_char=nchar(city)) %>% - arrange(country,state,postal_code,city) - - return(a_df) -} +# +# # BRAZIL ------------------------------------------------------------------ +# +# +# # Function to process addresses for both city and state +# process_brazil_address <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) # Not brl +# } +# if (x[[length(x)]] == "brazil") { +# if (length(x) == 3) { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) +# } else { +# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) +# } +# } +# +# +# return(c(NA, NA)) # Not brl +# } +# +# +# +# # Apply the function across all addresses +# results <- as.data.frame(t(mapply(process_brazil_address, list_address)), +# stringsAsFactors = FALSE) +# colnames(results) <- c("brl_city", "brl_state") +# +# +# +# +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# to_check <- c("dept","ctr","inst","ppg","andar","empresas", +# "programa", "ciencias", "unidade", "lab ") +# +# results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), +# results$brl_state, +# results$brl_city) +# +# +# +# results$brl_state <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_state), +# results$brl_city, +# results$brl_state) +# +# +# # Define the function +# extract_brl_postcodes <- function(df, source_col, target_col,brl_new_city) { +# # 6 digits +# +# pattern <- "br-[0-9]{5,8}" +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# df[i, brl_new_city] <- df[i, source_col] +# # df[i, city_col] <- df[i, source_col] +# +# } +# } +# return(df) +# } +# +# results$brl_pc<-NA +# results$brl_new_city<-NA +# # df, source_col, target_col,city_col +# results <- extract_brl_postcodes(results, "brl_city","brl_pc", "brl_new_city") +# results <- extract_brl_postcodes(results, "brl_state","brl_pc", "brl_new_city") +# +# +# results$brl_new_city <- ifelse(is.na(results$brl_new_city), +# results$brl_state, +# results$brl_new_city) +# +# +# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, +# results$brl_city, +# results$brl_state) +# +# +# +# +# results$brl_new_city <- ifelse(is.na(results$brl_new_city), +# results$brl_city, +# results$brl_new_city) +# +# +# +# +# results$brl_city<-gsub("br-","",results$brl_city) +# results$brl_city<-sub("[0-9]{2,8}","",results$brl_city) +# results$brl_city<-sub("-","",results$brl_city) +# +# +# results$brl_new_city<-gsub("br-","",results$brl_new_city) +# results$brl_new_city<-sub("[0-9]{2,8}","",results$brl_new_city) +# results$brl_new_city<-sub("-","",results$brl_new_city) +# +# +# results$brl_pc<-gsub("br-","",results$brl_pc) +# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) +# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) +# +# results$brl_state<-gsub("br-","",results$brl_state) +# results$brl_state<-sub("[0-9]{2,8} ","",results$brl_state) +# results$brl_state<-sub(" [0-9]{2,8} ","",results$brl_state) +# +# # +# # +# # +# # +# # +# # results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) +# # results$brl_new_city<-gsub("gavea rio de janeiro","rio de janeiro",results$brl_new_city) +# # results$brl_new_city<-gsub("gavea","rio de janeiro",results$brl_new_city) +# # # +# # results$brl_city<-gsub("-","",results$brl_city) +# # +# # results$brl_state<-gsub("br-","",results$brl_state) +# # results$brl_state<-gsub("-","",results$brl_state) +# +# +# +# +# +# # any without numbers gets NA'd +# results$brl_pc[!grepl("\\d", results$brl_pc)] <- NA +# +# # keep portions with numbers / remove city names +# +# results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) +# results$brl_city<-sub(" [0-9]{2,8}","",results$brl_city) +# # +# # # Specific replacements +# # results$brl_city <- ifelse(grepl("rio de janeiro", results$brl_city), +# # "rio de janeiro", results$brl_city) +# +# results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), +# "rio de janeiro", results$brl_city) +# results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), +# "rj", results$brl_state) +# results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), +# "sp", results$brl_state) +# +# +# +# results$brl_city[results$brl_city==results$brl_state]<-NA +# +# +# # Clean up and adjust columns +# results[] <- lapply(results, trimws) +# +# +# # Define city-to-state mapping +# city_state_mapping <- data.frame( +# city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), +# state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), +# stringsAsFactors = FALSE +# ) +# +# # Match cities and states +# for (i in 1:nrow(city_state_mapping)) { +# results$brl_city <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), +# city_state_mapping$city[i], results$brl_city) +# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), +# city_state_mapping$state[i], results$brl_state) +# } +# +# +# # Match cities and states +# for (i in 1:nrow(city_state_mapping)) { +# +# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), +# city_state_mapping$state[i], results$brl_state) +# } +# +# +# +# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, +# results$brl_city, +# results$brl_state) +# +# +# results$brl_city <- trimws(results$brl_city, which = "both") +# results$brl_state <- trimws(results$brl_state, which = "both") +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# to_check <- c("dept","ctr","inst","ppg", +# "programa", "ciencias", "unidade", "lab ") +# +# results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), +# results$brl_state, +# results$brl_city) +# +# +# # Final trimming +# results[] <- lapply(results, trimws) +# +# results$brl_city <- ifelse(results$brl_new_city==results$brl_city, +# NA, +# results$brl_city) +# +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$brl_city, city_list) +# # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$brl_state, state_list) +# # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$brl_pc, pc_list) +# +# +# rm(results,city_state_mapping) +# +# +# +# # Handle postal codes (BR-[0-9]) +# results$brazil_pc <- ifelse(grepl("br-[0-9]", results$brazil_state), results$brazil_state, NA) +# results$brazil_pc <- ifelse(grepl("br-[0-9]", results$brazil_city) & is.na(results$brazil_pc), +# results$brazil_city, results$brazil_pc) +# +# # Remove BR codes from city and state +# results$brazil_city <- gsub("br-[0-9]+", "", results$brazil_city) +# results$brazil_state <- gsub("br-[0-9]+", "", results$brazil_state) +# +# # Define city-to-state mapping +# city_state_mapping <- data.frame( +# city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "sao paulo"), +# state = c("sp", "sp", "sp", "sp", "rj", "rj", "sp"), +# stringsAsFactors = FALSE +# ) +# +# # Match cities and states +# for (i in 1:nrow(city_state_mapping)) { +# results$brazil_city <- ifelse(grepl(city_state_mapping$city[i], results$brazil_state, ignore.case = TRUE), +# city_state_mapping$city[i], results$brazil_city) +# results$brazil_state <- ifelse(grepl(city_state_mapping$city[i], results$brazil_state, ignore.case = TRUE), +# city_state_mapping$state[i], results$brazil_state) +# } +# +# # Specific replacements +# results$brazil_city <- ifelse(grepl("museu nacl", results$brazil_city), +# "rio de janeiro", results$brazil_city) +# results$brazil_state <- ifelse(grepl("rio de janeiro", results$brazil_state, ignore.case = TRUE), +# "rj", results$brazil_state) +# results$brazil_state <- ifelse(grepl("sao paulo", results$brazil_state, ignore.case = TRUE), +# "sp", results$brazil_state) +# +# # cleanup +# results$brazil_city[results$brazil_city==results$brazil_state]<-NA +# results$brazil_city <- trimws(results$brazil_city, which = "both") +# results$brazil_state <- trimws(results$brazil_state, which = "both") +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# to_check <- c("dept","ctr","inst","ppg", +# "programa", "ciencias", "unidade", "lab ") +# +# results$brazil_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brazil_city), +# results$brazil_state, +# results$brazil_city) +# +# # Clean postal codes +# results$brazil_pc <- gsub("[A-Za-z-]", "", results$brazil_pc) +# +# # Final trimming +# results[] <- lapply(results, trimws) +# +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$brazil_city, city_list) +# # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$brazil_state, state_list) +# # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$brazil_pc, pc_list) +# +# +# rm(results,city_state_mapping) +# +# +# # CHINA ------------------------------------------------------------------- +# chn_extract <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) +# } else if (x[[length(x)]] == "china") { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 2]]))) +# } else { +# return(c(NA, NA)) +# } +# } +# +# chn_pc <- data.frame(do.call(rbind, lapply(list_address, chn_extract))) +# names(chn_pc) <- c("chn_city", "chn_state") +# +# # Define the function +# extract_china_postcodes <- function(df, source_col, target_col,chn_new_city) { +# # 6 digits +# pattern <- "[0-9]{6}" +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# df[i, chn_new_city] <- df[i, source_col] +# # df[i, city_col] <- df[i, source_col] +# +# } +# } +# return(df) +# } +# +# chn_pc$chn_pc<-NA +# chn_pc$chn_new_city<-NA +# # df, source_col, target_col,city_col +# chn_pc <- extract_china_postcodes(chn_pc, "chn_city","chn_pc", "chn_new_city") +# chn_pc <- extract_china_postcodes(chn_pc, "chn_state","chn_pc", "chn_new_city") +# +# +# +# # any without numbers gets NA'd +# chn_pc$chn_pc[!grepl("\\d", chn_pc$chn_pc)] <- NA +# +# # keep portions with numbers / remove city names +# chn_pc$chn_new_city<-sub("[0-9]{6}","",chn_pc$chn_new_city) +# chn_pc$chn_city<-sub("[0-9]{6}","",chn_pc$chn_city) +# chn_pc$chn_state<-sub("[0-9]{6}","",chn_pc$chn_state) +# +# +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# +# to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", +# "dept", "div", "univ", "hosp", "coll", "sci", "rd", +# "program","minist", "educ", "sch ", "grad ", "fac ", +# "assoc","forest") +# +# # Define the function +# +# +# # Print the resulting dataframe +# print(a_df) +# +# +# chn_pc[, c("chn_city","chn_state", "chn_pc", "chn_new_city")]<-lapply(chn_pc[, c("chn_city","chn_state", "chn_pc", "chn_new_city")], trimws) +# +# +# clean_column <- function(column, delete_terms) { +# column <- gsub(paste(delete_terms, collapse = "|"), NA, column) +# column <- gsub("[0-9]", "", column) # Remove digits +# trimws(column) # Remove leading/trailing whitespace +# } +# +# +# # Clean chn_pc1 and chn_pc2 +# chn_pc$chn_city <- clean_column(chn_pc$chn_city, to_delete) +# chn_pc$chn_new_city <- clean_column(chn_pc$chn_new_city, to_delete) +# chn_pc$chn_state <- clean_column(chn_pc$chn_state, to_delete) +# +# +# chn_pc$chn_new_city <- ifelse(is.na(chn_pc$chn_new_city), +# chn_pc$chn_state, +# chn_pc$chn_new_city) +# +# +# chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_state)==TRUE), +# NA,chn_pc$chn_state) +# +# +# +# chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_city)==FALSE), +# chn_pc$chn_city,chn_pc$chn_state) +# +# +# +# chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_state)==TRUE), +# NA,chn_pc$chn_state) +# +# chn_pc$chn_state <- gsub(" province", "",chn_pc$chn_state) +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), chn_pc$chn_new_city, city_list) +# +# # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),chn_pc$chn_state, state_list) +# +# # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),chn_pc$chn_pc, pc_list) +# +# rm(chn_pc) +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # ### China has some where the postal code is with the city, so fix those here +# # # Extract postal code information from the list +# # chn_extract <- function(x) { +# # if (length(x) == 1) { +# # return(c(NA, NA)) +# # } else if (x[[length(x)]] == "china") { +# # return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 2]]))) +# # } else { +# # return(c(NA, NA)) +# # } +# # } +# +# # Apply extraction to list_address +# # chn_missing_pc <- data.frame(do.call(rbind, lapply(list_address, chn_extract))) +# # names(chn_missing_pc) <- c("chn_pc1", "chn_pc2") +# # +# # # Define words indicating this is actually a dept not state or postal code +# # # will use this list to delete the ones that don't apply +# # to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", +# # "dept", "div", "univ", "hosp", "coll", "sci", "rd","program" +# # "minist", "educ", "sch ", "grad ", "fac ","assoc") +# # +# # +# # +# # # Extract numeric postal codes +# # chn_missing_pc$chn_pc_from1 <- as.numeric(gsub("[A-Za-z]", "", chn_missing_pc$chn_pc1)) +# # chn_missing_pc$chn_pc_from2 <- as.numeric(gsub("[A-Za-z]", "", chn_missing_pc$chn_pc2)) +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # clean_column <- function(column, delete_terms) { +# # column <- gsub(paste(delete_terms, collapse = "|"), NA, column) +# # column <- gsub("[0-9]", "", column) # Remove digits +# # trimws(column) # Remove leading/trailing whitespace +# # } +# # +# # # Clean chn_pc1 and chn_pc2 +# # chn_missing_pc$chn_pc1 <- clean_column(chn_missing_pc$chn_pc1, to_delete) +# # chn_missing_pc$chn_pc2 <- clean_column(chn_missing_pc$chn_pc2, to_delete) +# # +# # # Initialize empty columns for final outputs +# # chn_missing_pc$chn_pc <- NA +# # chn_missing_pc$chn_city <- NA +# # chn_missing_pc$chn_state <- NA +# # +# # # Assign postal codes, cities, and states based on conditions +# # assign_chn_data <- function(from1, from2, pc1, pc2) { +# # list( +# # chn_pc = ifelse(is.na(from1) & !is.na(from2), from2, from1), +# # chn_city = ifelse(is.na(from1) & !is.na(from2), pc2, pc1), +# # chn_state = ifelse(is.na(from1) & !is.na(from2), pc1, pc2) +# # ) +# # } +# # +# # chn_result <- assign_chn_data(chn_missing_pc$chn_pc_from1, +# # chn_missing_pc$chn_pc_from2, +# # chn_missing_pc$chn_pc1, +# # chn_missing_pc$chn_pc2) +# # +# # chn_missing_pc$chn_pc <- chn_result$chn_pc +# # chn_missing_pc$chn_city <- gsub("[0-9]", "", chn_result$chn_city) +# # chn_missing_pc$chn_state <- gsub("[0-9]", "", chn_result$chn_state) +# +# # # Define Chinese states and cities +# # chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", +# # "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", +# # "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", +# # "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", +# # "gansu", "inner mongolia", "jilin", "hainan", "ningxia", +# # "qinghai", "tibet", "macao") +# # +# # # All the cities in the addresses, add as needed. +# # chn_cities <- unique(c(chn_missing_pc$chn_city, "lhasa")) +# +# # Update states and cities based on matching conditions +# # chn_missing_pc$chn_state <- ifelse(is.na(chn_missing_pc$chn_state) & chn_missing_pc$chn_pc1 %in% chn_states, +# # chn_missing_pc$chn_pc1, chn_missing_pc$chn_state) +# # chn_missing_pc$chn_state <- ifelse(is.na(chn_missing_pc$chn_state) & chn_missing_pc$chn_pc2 %in% chn_states, +# # chn_missing_pc$chn_pc2, chn_missing_pc$chn_state) +# # +# # chn_missing_pc$chn_city <- ifelse(is.na(chn_missing_pc$chn_city) & !(chn_missing_pc$chn_pc1 %in% chn_states), +# # chn_missing_pc$chn_pc1, chn_missing_pc$chn_city) +# # chn_missing_pc$chn_city <- ifelse(is.na(chn_missing_pc$chn_city) & !(chn_missing_pc$chn_pc2 %in% chn_states), +# # chn_missing_pc$chn_pc2, chn_missing_pc$chn_city) +# # +# # # put the postal codes and cities in the pc_list, state_list +# # pc_list<-ifelse(is.na(pc_list),chn_missing_pc$chn_pc, pc_list) +# # city_list<-ifelse(is.na(city_list),chn_missing_pc$chn_city, city_list) +# # state_list<-ifelse(is.na(state_list),chn_missing_pc$chn_state, state_list) +# # +# # +# # rm(chn_cities,chn_states,to_delete,chn_result,chn_missing_pc) +# +# +# +# +# +# +# +# +# +# # UK ------------------------------------------------------------------ +# +# process_uk_address <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) # Not uk +# } +# +# if ((x[[length(x)]] == "england")| +# (x[[length(x)]] == "scotland")| +# (x[[length(x)]] == "wales")| +# (x[[length(x)]] == "northern ireland")) { +# if (length(x) == 3) { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) +# } else { +# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) +# } +# } +# +# return(c(NA, NA)) # Not uk +# } +# +# # Apply the function across all addresses using `mapply` +# results <- t(mapply(process_uk_address, list_address)) +# colnames(results) <- c("uk_city", "uk_state") +# +# +# +# +# # Clean up results +# results <- as.data.frame(results, stringsAsFactors = FALSE) +# results$uk_city[results$uk_city == "not uk"] <- NA +# results$uk_state[results$uk_state == "not uk"] <- NA +# +# +# +# results$uk_pc<-NA +# +# # Define the function +# extract_uk_postcodes <- function(df, source_col, target_col,city_col) { +# # Regular expression pattern for UK postal codes +# # One or two initial letters. +# # One or two digits (and possibly a letter). +# # A mandatory space. +# # One digit followed by two letters. +# pattern <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" +# pattern2 <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [A-Za-z]{2}" +# pattern3 <- "[A-Za-z]{1,2}[0-9R]{3} [A-Za-z]{3}" +# +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# df[i, city_col] <- df[i, source_col] +# +# } +# } +# return(df) +# } +# +# # Example usage +# +# results <- extract_uk_postcodes(results, "uk_city","uk_pc", "uk_city") +# results <- extract_uk_postcodes(results, "uk_state","uk_pc", "uk_city") +# +# # any without numbers gets NA'd +# results$uk_pc[!grepl("\\d", results$uk_pc)] <- NA +# +# +# +# results$new_city<-NA +# +# +# # Define the function +# uk_city_id <- function(df, source_col, target_col) { +# # Regular expression pattern for UK postal codes +# # One or two initial letters. +# # One or two digits (and possibly a letter). +# # A mandatory space. +# # One digit followed by two letters. +# pattern <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- df[i, source_col] +# } +# } +# return(df) +# } +# +# # Example usage +# +# results <- uk_city_id(results, "uk_city","new_city") +# results <- uk_city_id(results, "uk_state","new_city") +# +# +# results$new_city<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$new_city) +# +# +# +# +# # Define the function +# uk_city_id <- function(df, source_col, target_col) { +# # Regular expression pattern for UK postal codes +# # One or two initial letters. +# # One or two digits (and possibly a letter). +# # A mandatory space. +# # One digit followed by two letters. +# pattern <- "\\s[A-Za-z0-9]{2,4}\\s[A-Za-z0-9]{2,4}" +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- df[i, source_col] +# } +# } +# return(df) +# } +# +# # Example usage +# results <- uk_city_id(results, "uk_state","new_city") +# +# +# +# results$uk_state<-ifelse(results$uk_state==results$uk_city, +# "",results$uk_state) +# +# results$new_city<-ifelse(is.na(results$new_city), +# results$uk_city, +# results$new_city) +# # remove zip codes from new city +# results$new_city<-gsub("\\s[A-Za-z0-9]{3}\\s[A-Za-z0-9]{3}","",results$new_city) +# results$new_city<-gsub("\\s[A-Za-z0-9]{4}\\s[A-Za-z0-9]{2,3}","",results$new_city) +# +# +# results$new_city<-ifelse(results$uk_state=="london", +# "london", +# results$new_city) +# +# +# results$uk_state<-ifelse(results$uk_state=="london", +# NA, +# results$uk_state) +# +# +# +# +# +# +# +# +# +# +# # keep portions with numbers / remove city names +# # results$uk_city<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$uk_city) +# # results$uk_state<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$uk_state) +# # results$uk_pc<-sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", results$uk_pc) +# +# # results$can_pc<-gsub("\\b[^\\d\\s]+\\b", "", results$can_pc) #+[a-z] removes 2nd ltr +# +# results$uk_pc[results$uk_pc == ""] <- NA +# # now remove any PC from city +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$new_city, city_list) +# +# # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$uk_state, state_list) +# +# # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$uk_pc, pc_list) +# +# rm(results) +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# # Extracts postal code when combined with city name ----------------------- +# +# city_list <- trimws(city_list, which = "both") +# +# +# +# city_clean<-data.frame( +# authorID=ID, +# addresses=addresses, +# original_city=city_list, +# city_list=city_list, +# state_list=state_list, +# country_list=country_list, +# extract_pc=pc_list) +# +# +# # # England, Scotland, Wales --------------------------------------------- +# # +# # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc)& (city_clean$country_list=="england"| +# # city_clean$country_list=="wales" | +# # city_clean$country_list=="scotland"), +# # gsub(".*\\s([a-z0-9]+\\s[a-z0-9]+)$", "\\1", city_clean$city_list), +# # city_clean$extract_pc) +# # +# # # This then deletes the postal code from the city name +# # city_clean$city_list<-ifelse((city_clean$country_list=="england"| +# # city_clean$country_list=="wales" | +# # city_clean$country_list=="scotland"), +# # (gsub("([A-Za-z0-9]+\\s[A-Za-z0-9]+)$", "", city_clean$city_list)), +# # city_clean$city_list) +# # +# city_clean[city_clean == ""] <- NA +# +# # Define the function +# extract_postcodes <- function(df, source_col, target_col) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- "\\b[A-Za-z]{1,2}[-][0-9]{4,8}\\b" +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# } +# } +# return(df) +# } +# +# # Example usage +# +# city_clean <- extract_postcodes(city_clean, "city_list","extract_pc") +# +# +# +# # Define the function +# extract_postcodes <- function(df, source_col, target_col) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- " [0-9]{3,9}" +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# } +# } +# return(df) +# } +# +# +# # Example usage +# +# city_clean <- extract_postcodes(city_clean, "city_list","extract_pc") +# +# +# +# # Define the function +# delete_matching_text <- function(df, col_a, col_b) { +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Check if the value in column A is not NA and is found within the text in column B +# if(!is.na(df[i, col_a]) && grepl(df[i, col_a], df[i, col_b])) { +# # Remove the matching text from column B by replacing it with an empty string +# df[i, col_b] <- gsub(df[i, col_a], "", df[i, col_b]) +# } +# } +# return(df) +# } +# +# city_clean <- delete_matching_text(city_clean, "extract_pc","city_list") +# +# city_clean <- delete_matching_text(city_clean, "extract_pc","state_list") +# +# +# # remove state if same as city +# +# city_clean$state_list<-ifelse(city_clean$state_list==city_clean$city_list,NA, city_clean$state_list) +# +# +# # there are some usa ones that had state zip buyt no country +# +# +# # Define the function +# extract_postcodes <- function(df, country_list, extract_pc, state_list) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- "\\b[A-Za-z]{2} [0-9]{5}\\b" +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, country_list]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, country_list], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, extract_pc])) { +# df[i, extract_pc] <- extracted_codes[[1]][1] +# df[i, state_list] <- extracted_codes[[1]][1] +# df[i, country_list] <- "usa" +# } +# } +# return(df) +# } +# +# +# # Example usage +# +# city_clean <- extract_postcodes(city_clean, +# "country_list", "extract_pc", "state_list") +# +# +# # remove zip codes from states +# city_clean$state_list<-ifelse(city_clean$country_list=="usa", +# gsub("[0-9]","",city_clean$state_list), +# city_clean$state_list) +# +# # remove state from zipcode +# city_clean$extract_pc<-ifelse(city_clean$country_list=="usa", +# gsub("[a-z]","",city_clean$extract_pc), +# city_clean$extract_pc) +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# city_clean$extract_pc <- trimws(gsub("&", "", city_clean$extract_pc)) +# city_clean[city_clean == ""] <- NA +# +# # +# # +# # +# # +# # +# head(city_clean) +# # +# # # Cleaning up city names with zip codes in them +# # # take the zip code and put it in a new column before deleting +# # +# # +# # +# # +# # # 2) Countries with postal code AFTER city -------------------------------- +# # +# # +# # city_zip<-c("ireland","qatar","kazakhstan","peru","turkey","south korea", +# # "japan","costa rica","mexico","new zealand","iran","thailand", +# # "russia","spain","india","singapore","indonesia","chile", +# # "finland","colombia","taiwan","saudi arabia","uruguay", +# # "slovenia","spain") +# # +# # +# # +# # city_clean$extract_pc<-ifelse((is.na(city_clean$extract_pc)& (city_clean$country_list %in% city_zip)), +# # (gsub(".*[A-Za-z]+", "", city_clean$city_list)), +# # city_clean$extract_pc) +# # # +# # city_clean$city_list<-ifelse((city_clean$country_list %in% city_zip), +# # gsub("\\s([0-9]+)", "", city_clean$city_list), +# # city_clean$city_list) +# # +# # city_clean[city_clean == ""] <- NA +# # +# # +# # # 3) Postal code and dash BEFORE city name -------------------------------- +# # +# # +# # zip_dash<-c("finland","slovakia","austria","portugal","belgium", +# # "spain","israel","czech republic","argentina","france", +# # "sweden","switzerland","turkey","germany","italy", +# # "lithuania","hungary","denmark","poland","norway", "iceland", +# # "greece", "ukraine","estonia","latvia","luxembourg","lativa", +# # "south africa","bulgaria","brazil") +# # +# # +# # +# # +# # city_clean$extract_pc <- ifelse((is.na(city_clean$extract_pc) & +# # (city_clean$country_list %in% zip_dash)), +# # # gsub("\\s([A-Za-z]+)", "", city_clean$city_list), +# # sub(" .*", "", city_clean$city_list), +# # city_clean$extract_pc) +# # +# # +# # city_clean$city_list<-ifelse((city_clean$country_list %in% zip_dash), +# # gsub("[a-zA-Z]+-[0-9]+", "", city_clean$city_list), +# # city_clean$city_list) +# # +# # +# # city_clean[city_clean == ""] <- NA +# # +# # # 4) Netherlands Postal Code ---------------------------------------------- +# # +# # # Netherlands has Postal Code before +# # # it is a combination of 2-3 blocks of letters and numbers +# # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc) & (city_clean$country_list=="netherlands"), +# # (gsub("^(([^ ]+ ){2}).*", "\\1", city_clean$city_list)), +# # city_clean$extract_pc) +# # city_clean$city_list<-ifelse((city_clean$country_list=="netherlands"), +# # (gsub(".*\\s", "", city_clean$city_list)), +# # city_clean$city_list) +# # city_clean[city_clean == ""] <- NA +# # +# # # 5) Venezuela ----------------------------------------------------------- +# # +# # # Venezuela has postal code after, it is combo of letters and numbers +# # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc) & (city_clean$country_list=="venezuela"), +# # gsub(".*\\s([a-z0-9]+)$", "\\1", city_clean$city_list), +# # city_clean$extract_pc) +# # +# # # This then deletes the postal code from the city name +# # city_clean$city_list<-ifelse(city_clean$country_list=="venezuela", +# # (gsub("(\\s[A-Za-z0-9]+)$", "", city_clean$city_list)), +# # city_clean$city_list) +# # city_clean[city_clean == ""] <- NA +# # +# # +# # +# # # trim ws +# # city_clean$extract_pc <- trimws(city_clean$extract_pc, which = "both") +# # city_clean$city_list <- trimws(city_clean$city_list, which = "both") +# # # This removes any that don't have numbers in them +# # city_clean$extract_pc[!grepl("[0-9]", city_clean$extract_pc)] <- NA +# # city_clean$city_list <- gsub("[0-9]+", "", city_clean$city_list) +# # +# # +# # Final Clean Up ---------------------------------------------------------- +# +# # Russia +# city_clean$city_list<-ifelse(city_clean$country_list=="russia", +# (gsub("st Petersburg p", "st petersburg", city_clean$city_list)), +# city_clean$city_list) +# +# +# +# # India +# India_delete<- c("dept") +# +# city_clean$city_list <- ifelse(city_clean$country_list=="india", +# gsub(paste(India_delete, collapse = "|"), NA, city_clean$city_list), +# city_clean$city_list) +# city_clean$city_list<-trimws(city_clean$city_list) # Remove leading/trailing whitespace +# +# # brazil +# city_clean$extract_pc<-ifelse((city_clean$country_list=="brazil" & nchar(city_clean$extract_pc) < 5), +# "",city_clean$extract_pc) +# city_clean[city_clean == ""] <- NA +# +# brazil_delete<- c("barretos canc hosp","univ fed sao paulo", +# "escola filosofia letras & ciencias humanas", +# "hosp sirio libanes","perola byington hosp", +# "sp","univ fed sao paulo","nacl","pesquisa", "museu","dept", +# "lab","zoologia", "inst", "programa","ppg", "ppg") +# +# +# city_clean$city_list <- ifelse(city_clean$country_list=="brazil", +# gsub(paste(brazil_delete, collapse = "|"), NA, city_clean$city_list), +# city_clean$city_list) +# +# city_clean$city_list<-trimws(city_clean$city_list, which="both") # Remove leading/trailing whitespace +# city_clean$state_list<-trimws(city_clean$state_list, which="both") # Remove leading/trailing whitespace +# city_clean$country_list<-trimws(city_clean$country_list, which="both") # Remove leading/trailing whitespace) # Remove leading/trailing whitespace +# # City Abbreviations +# +# city_clean$city_list<-ifelse(city_clean$city_list=="university pk", +# "university park", +# city_clean$city_list) +# +# +# +# city_clean$city_list<-ifelse(city_clean$city_list=="college stn", +# "college station", +# city_clean$city_list) +# +# city_clean$city_list<-ifelse(city_clean$city_list=="n chicago", +# "north chicago", +# city_clean$city_list) +# +# city_clean$city_list<-ifelse(city_clean$city_list=="college pk", +# "college park", +# city_clean$city_list) +# +# city_clean$city_list<-ifelse(city_clean$city_list=="research triangle pk", +# "research triangle park", +# city_clean$city_list) +# +# city_clean$city_list<-ifelse(city_clean$city_list=="state coll", +# "state college", +# city_clean$city_list) +# +# # city corrections +# city_clean$city_list<-ifelse((city_clean$city_list=="dehra dun" & city_clean$country_list == "india"), +# "dehradun", +# city_clean$city_list) +# +# city_clean$city_list<-ifelse((city_clean$city_list=="st john" & city_clean$country_list == "canada"), +# "st. john's", +# city_clean$city_list) +# +# city_clean$state_list<-ifelse(city_clean$state_list=="london ", +# "london", +# city_clean$state_list) +# +# city_clean$city_list<-ifelse((city_clean$state_list=="london" & city_clean$country_list == "england"), +# "london", +# city_clean$city_list) +# +# +# city_clean$state_list<-ifelse(city_clean$state_list=="london", +# NA, +# city_clean$state_list) +# +# +# +# city_list<-city_clean$city_list +# state_list<-city_clean$state_list +# pc_list<-city_clean$extract_pc +# country_list<-city_clean$country_list +# +# +# rm(brazil_delete,India_delete) +# +# +# # rm(city_clean) +# +# +# pc_list[pc_list == ""] <- NA +# city_list[city_list == ""] <- NA +# state_list[state_list == ""] <- NA +# dept_list[dept_list == ""] <- NA +# country_list[country_list == ""] <- NA +# # Create the df that will be returned +# cleaned_ad<-data.frame(ID, +# addresses, +# university_list, +# dept_list, +# city_list, +# country_list, +# state_list, +# pc_list) +# +# +# +# # names(cleaned_ad) +# +# +# list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))]) +# +# # Because formats of address printing is different across platforms +# # We are going to split using a tier system assuming first and last +# # info is somewhat reliable and guess the other info from the +# # remaining position of the info +# +# second_tier_list <- lapply(list_address1, function(x) x[length(x)]) +# second_tier_list <- trimws(second_tier_list, which = "both") +# second_tier_list[second_tier_list == "character(0)"] <- NA +# +# list_address2 <- lapply(list_address1, function(x) x[-c(length(x))]) +# +# third_tier_list <- lapply(list_address2, function(x) x[length(x)]) +# third_tier_list <- trimws(third_tier_list, which = "both") +# third_tier_list[third_tier_list == "character(0)"] <- NA +# +# # All remaining info is just shoved in this category +# remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1]) +# remain_list <- trimws(remain_list, which = "both") +# remain_list[remain_list == "character(0)"] <- NA +# +# +# # original +# # a_df <- data.frame( +# # adID = ID, university = university_list, +# # country = country_list, +# # state = state_list, postal_code = pc_list, city = NA, +# # department = NA, second_tier = second_tier_list, +# # third_tier = third_tier_list, +# # remain = remain_list, address = addresses, +# # stringsAsFactors = FALSE +# # ) +# +# # EB EDIT +# a_df_1 <- data.frame( +# adID = ID, +# university_1 = university_list, +# university = university_list, +# country_1 = country_list, +# country = country_list, +# state_1 = state_list, +# state = state_list, +# postal_code_1 = pc_list, +# postal_code = pc_list, +# city_1 = city_list, +# city = city_list, +# department_1 = dept_list, +# department = dept_list, +# second_tier = second_tier_list, +# third_tier = third_tier_list, +# remain = remain_list, +# address = addresses, +# stringsAsFactors = FALSE +# ) +# +# a_df_1$city_list<-ifelse(is.na(a_df_1$city_1), +# a_df_1$city, +# a_df_1$city_1) +# +# a_df_1$postal_code_1<-ifelse(is.na(a_df_1$postal_code_1), +# a_df_1$postal_code, +# a_df_1$postal_code_1) +# +# +# # Quebec Postal Code is PQ (Fr) or QC (En) but only QC is georeferenced +# a_df_1$state_1<-ifelse(a_df_1$state_1=="pq" & a_df_1$country_1 == "canada", +# "qc", +# a_df_1$state_1) +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# a_df<-a_df_1 +# +# rm(a_df_1) +# +# +# +# # try to fix the usa spots, which vary in format than other countries +# a_df$state<-ifelse(is.na(a_df$state),"",a_df$state) # added by eb to deal with index problem +# a_df$city[nchar(a_df$state) > 0] <- a_df$second_tier[nchar(a_df$state) > 0] +# a_df$state[nchar(a_df$state) == 0] <- NA +# a_df$postal_code[nchar(a_df$postal_code) == 0] <- NA +# a_df$department[!is.na(a_df$state) & !is.na(a_df$postal_code) & +# !is.na(a_df$state)] <- a_df$third_tier[!is.na(a_df$state) & +# !is.na(a_df$postal_code) & !is.na(a_df$state)] +# # fix a US problem when usa is not tacked onto the end +# +# us_reg <- "[[:alpha:]]{2}[[:space:]]{1}[[:digit:]]{5}" +# a_df$state[ grepl(us_reg, a_df$country) ] <- +# substr(a_df$country[ grepl(us_reg, a_df$country) ], 1, 2) +# +# a_df$postal_code[ grepl(us_reg, a_df$country) ] <- +# substr(a_df$country[grepl(us_reg, a_df$country)], 4, 8) +# +# a_df$country[grepl(us_reg, a_df$country)] <- "usa" +# +# +# a_df$state_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", +# a_df$state, +# a_df$state_1) +# +# a_df$postal_code_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", +# a_df$postal_code, +# a_df$postal_code_1) +# +# +# a_df$country_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", +# a_df$country, +# a_df$country_1) +# +# +# ########################## +# # We'll use regular expression to pull zipcodes +# # These formats differ by region +# int1 <- "[[:alpha:]]{2}[[:punct:]]{1}[[:digit:]]{1,8}" +# int2 <- paste("[[:space:]][[:upper:]][[:digit:]][[:upper:]]", +# "[[:space:]][[:digit:]][[:upper:]][[:digit:]]", sep="") +# int3 <- "[[:alpha:]][[:punct:]][[:digit:]]{4,7}" +# int4 <- "[:upper:]{1,2}[:alnum:]{1,3}[:space:][:digit:][:alnum:]{1,3}" +# int <- paste(int1, int2, int3, int4, sep = "|") +# +# uk <- paste("[[:upper:]]{1,2}[[:digit:]]{1,2}[[:space:]]", +# "{1}[[:digit:]]{1}[[:upper:]]{2}", sep="") +# +# mexico <- "[[:space:]]{1}[[:digit:]]{5}" # technically US as well +# +# panama <- "[[:digit:]]{4}-[[:digit:]]{5}" +# +# zip_search <- paste0(int, "|", uk, "|", mexico, "|", panama) +# +# +# +# +# +# # ADDRD EB INSTEAD OF +# a_df$city_1 <- ifelse(is.na(a_df$city_1),a_df$third_tier,a_df$city_1) +# a_df$state_1 <- ifelse(is.na(a_df$state_1),a_df$second_tier,a_df$state_1) +# a_df$postal_code_1 <- ifelse(is.na(a_df$postal_code_1),a_df$second_tier,a_df$postal_code_1) +# +# +# # fix country - usa +# # Function to remove everything before " usa" +# remove_before_usa <- function(x) { +# if (grepl(" usa", x)) { +# return(sub(".*(?= usa)", "", x, perl = TRUE)) +# } else { +# return(x) +# } +# } +# +# # Apply the function to each element in the vector +# a_df$country_1 <- sapply(a_df$country_1, remove_before_usa) +# a_df$country_1 <- trimws(a_df$country_1, which = "both") +# +# +# a_df$state_1 <- ifelse(a_df$country_1=="usa", +# (trimws(gsub("[0-9]", "", a_df$state_1), which="both")), +# a_df$state_1) +# +# a_df$postal_code_1 <- ifelse(a_df$country_1=="usa", +# (trimws(gsub("[a-z]", "", a_df$postal_code_1), which="both")), +# a_df$postal_code_1) +# +# +# +# ########################### +# id_run <- a_df$adID[is.na(a_df$state) & is.na(a_df$postal_code) & +# a_df$address != "Could not be extracted"] +# ########################### +# +# # We now iteratively run through the addresses using the concept that +# # certain information always exists next to each other. +# # Ex. city, state, country tend to exist next to each other. +# # We use the position of the zipcode also to help guide us +# # in where the information lies as well as how many fields were +# # given to us. +# for (i in id_run) { +# found <- FALSE +# row <- which(a_df$adID == i) +# university <- a_df$university[row] +# second_tier <- a_df$second_tier[row] +# third_tier <- a_df$third_tier[row] +# remain <- a_df$remain[row] +# city <- a_df$city[row] +# state <- a_df$state[row] +# postal_code <- a_df$postal_code[row] +# department <- a_df$department[row] +# grepl(zip_search, second_tier) +# grepl(zip_search, third_tier) +# # 2nd tier +# if (grepl(zip_search, second_tier)) { +# found <- TRUE +# postal_code <- regmatches(second_tier, regexpr(zip_search, second_tier)) +# city <- gsub(zip_search, "", second_tier) +# department <- ifelse(is.na(remain), third_tier, remain) +# } +# # 3RD tiers +# if (grepl(zip_search, third_tier) & !found) { +# found <- TRUE +# postal_code <- regmatches(third_tier, regexpr(zip_search, third_tier)) +# city <- gsub(zip_search, "", third_tier) +# state <- second_tier +# department <- remain +# } +# +# if (!found) { +# state <- second_tier +# city <- third_tier +# department <- remain +# } +# # To make university searching more efficient we'll override values +# # based on if it has university/college in the name, +# # where university overides college +# override_univ <- grepl("\\buniv\\b|\\buniversi", +# c(second_tier, third_tier, remain, city, university), +# ignore.case = TRUE) & +# !grepl("\\bdrv\\b|\\bdrive\\b", +# c(second_tier, third_tier, remain, city, university), +# ignore.case = TRUE) +# +# if (any(override_univ)) { +# university <- +# c(second_tier, third_tier, remain, city, university)[override_univ][1] +# assign( +# c("second_tier", "third_tier", "remain", "city", "university")[ +# override_univ][1], +# NA +# ) +# } +# # only if university doesnt already exist +# override_univ_col <- +# grepl("\\bcol\\b|college|\\bcoll\\b", +# c(second_tier, third_tier, remain, city, university), +# ignore.case = TRUE) & +# !grepl("\\bdrv\\b|\\bdrive\\b", +# c(second_tier, third_tier, remain, city, university), +# ignore.case = TRUE) +# +# if (!any(override_univ) & any(override_univ_col)) { +# university <- +# c(second_tier, third_tier, remain, city, university )[ +# override_univ_col][1] +# +# assign( +# c("second_tier", "third_tier", "remain", "city", "university")[ +# override_univ_col][1], +# NA +# ) +# } +# # more risky, but institutions as well, just incase its not a university +# override_univ_inst <- grepl("\\binst\\b|\\binstitut", +# c(second_tier, third_tier, remain, city, university), +# ignore.case = TRUE) +# if ( +# !any(override_univ) & !any(override_univ_col) & any(override_univ_inst) +# ) { +# department <- c(second_tier, third_tier, remain, city, university )[ +# override_univ_inst][1] +# +# assign( +# c("second_tier", "third_tier", "remain", "city", "university")[ +# override_univ_inst][1], +# NA +# ) +# } +# +# a_df$city[row] <- gsub("[[:digit:]]", "", city) +# a_df$state[row] <- gsub("[[:digit:]]", "", state) +# a_df$postal_code[row] <- postal_code +# a_df$department[row] <- department +# +# +# +# #########################Clock############################### +# total <- length(id_run) +# pb <- utils::txtProgressBar(min = 0, max = total, style = 3) +# utils::setTxtProgressBar(pb, which(id_run == i)) +# ############################################################# +# } +# +# +# city_fix <- is.na(a_df$city) & !is.na(a_df$state) +# a_df$city[city_fix] <- a_df$state[city_fix] +# a_df$state[city_fix] <- NA +# a_df$university[a_df$university == "Could not be extracted"] <- NA +# a_df$country[a_df$country == "Could not be extracted"] <- NA +# # a_df$country[a_df$country == "peoples r china"] <- "China" +# # a_df$country[a_df$country == "U Arab Emirates"] <- "United Arab Emirates" +# # a_df$country[a_df$country == "Mongol Peo Rep"] <- "Mongolia" +# +# a_df$postal_code[grepl("[[:alpha:]]{1,2}-", a_df$postal_code)] <- +# vapply(strsplit( +# a_df$postal_code[ grepl("[[:alpha:]]{1,2}-", a_df$postal_code)], +# "-"), +# function(x) x[2], character(1) +# ) +# #strip periods from the ends of city,state,country +# a_df$city <- gsub("\\.", "", a_df$city) +# a_df$state <- gsub("\\.", "", a_df$state) +# a_df$country <- gsub("\\.", "", a_df$country) +# a_df$country[a_df$country == ""] <- NA +# a_df$university[a_df$university == ""] <- NA +# a_df$postal_code[a_df$postal_code == ""] <- NA +# #convert to lower +# for (l in 2:ncol(a_df)){ +# a_df[, l] <- tolower(a_df[, l]) +# } +# +# # Select columns +# a_df <- a_df[, c("adID", +# "university_1", +# "country_1", +# "state_1", +# "postal_code_1", +# "city_1", +# "department_1", +# "second_tier", +# "third_tier", +# "remain", +# "address") +# ] +# +# # Rename columns +# colnames(a_df) <- c("adID", +# "university", +# "country", +# "state", +# "postal_code", +# "city", +# "department", +# "second_tier", +# "third_tier", +# "remain", +# "address") +# +# +# # sometimes the postal code fails to prse out of state. canm use this +# # when postal code is missing, but then need to remove +# # Function to extract numbers from one column and copy them to another column +# extract_numbers <- function(df, source_col, target_col) { +# if (is.na(target_col)) { +# +# +# df[[target_col]] <- gsub(".*?(\\d+).*", "\\1", df[[source_col]]) +# df[[target_col]][!grepl("\\d", df[[source_col]])] <- NA +# return(df) +# +# } else { +# return(df) +# } +# +# } +# +# # Apply the function to the dataframe +# a_df <- extract_numbers(a_df, "state", "postal_code") +# +# +# +# # ther postal code and city are sometimes in tier3 +# a_df$city <- ifelse(is.na(a_df$city), a_df$third_tier, a_df$city) +# a_df$postal_code <- ifelse(is.na(a_df$postal_code), a_df$third_tier, a_df$postal_code) +# +# +# +# +# +# # Function to remove matching characters from col1 based on col2 +# remove_matching <- function(col1, col2) { +# pattern <- paste0("\\b", gsub("([\\W])", "\\\\\\1", col2), "\\b") +# result <- sub(pattern, "", col1) +# trimws(result) +# } +# +# # Apply the function to each row +# a_df$city <- mapply(remove_matching, a_df$city, a_df$postal_code) +# a_df$state <- mapply(remove_matching, a_df$state, a_df$postal_code) +# +# +# +# library(tidyverse) +# +# +# country<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country) %>% +# mutate(summary=nchar(country)) %>% +# arrange(country) +# +# +# country_city<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country,city) %>% +# mutate(city_char=nchar(city)) %>% +# arrange(country,city) +# +# +# +# country_state<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country,state) %>% +# mutate(city_char=nchar(state)) %>% +# arrange(country,state) +# +# country_state_city<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country ,state,city) %>% +# mutate(city_char=nchar(city)) %>% +# arrange(country,state,city) +# +# +# country_state_city_pc<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country ,state,postal_code,city) %>% +# mutate(city_char=nchar(city)) %>% +# arrange(country,state,postal_code,city) +# +# return(a_df) +# } +# +# +# +# +# +# +# +# +# +# +# #. old +# country_list <- vapply(list_address, function(x) { +# gsub("\\_", "", x[length(x)]) }, +# character(1)) +# country_list <- trimws(country_list, which = "both") +# pc_list <- rep(NA, length(list_address)) +# state_list <- rep(NA, length(list_address)) +# city_list<- rep(NA, length(list_address)) +# country_list <- ifelse(grepl("usa", country_list), "usa", country_list) +# +# list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))]) +# +# +# +# pc_list <- trimws(substr(country_list, 1, (vapply(regexpr("usa", +# country_list), function(x) x[1], numeric(1))) - 1), which = "right") +# state_list <- pc_list +# +# state_list[nchar(state_list) > 0] <- regmatches( +# state_list[nchar(state_list) > 0], +# regexpr("[[:lower:]]{2}", state_list[nchar(state_list) > 0]) +# ) +# state_list[state_list == ""] <- NA +# +# pc_list[nchar(pc_list) > 2] <- regmatches(pc_list[nchar(pc_list) > 2], +# regexpr("[[:digit:]]{5}", pc_list[nchar(pc_list) > 2])) +# pc_list[nchar(pc_list) < 3] <- "" +# pc_list[pc_list == ""] <- NA +# +# +# +# +# city_list <- lapply(list_address1, function(x) x[length(x)]) +# city_list <- trimws(city_list, which = "both") +# city_list[city_list == "character(0)"] <- NA +# +# list_address2 <- lapply(list_address1, function(x) x[-c(length(x))]) +# +# dept_list <- lapply(list_address2, function(x) x[length(x)]) +# dept_list <- trimws(dept_list, which = "both") +# dept_list[dept_list == "character(0)"] <- NA +# +# # All remaining info is just shoved in this category +# remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1]) +# remain_list <- trimws(remain_list, which = "both") +# remain_list[remain_list == "character(0)"] <- NA +# +# +# a_df <- data.frame( +# adID = ID, +# university = university_list, +# country = country_list, +# city = city_list, +# state = state_list, +# postal_code = pc_list, +# department = dept_list, +# remain = remain_list, +# address = addresses, +# stringsAsFactors = FALSE +# ) +# +# +# +# +# # # extracting postal codes - USA ------------------------------------------- +# +# # # USA -------------------------------------------------------------------- +# # +# # process_usa_address <- function(x) { +# # if (length(x) == 1) { +# # return(c(NA, NA)) # Not usa +# # } +# # if (grepl(" usa", x[[length(x)]])) { +# # if (length(x) == 4) { +# # return(c(trimws(x[[length(x) - 1]]), trimws(x[length(x)]))) +# # } +# # if (length(x) == 5) { +# # return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x)]]))) +# # } +# # if (length(x) == 3) { +# # return(c(trimws(x[[length(x) - 1]]), trimws(x[length(x)]))) +# # } else { +# # return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x)]]))) +# # } +# # } +# # +# # return(c(NA, NA)) # Not usa +# # } +# # +# # +# # # Apply the function across all addresses using `mapply` +# # results <- t(mapply(process_usa_address, list_address)) +# # colnames(results) <- c("usa_city", "usa_state") +# # +# # results<-as.data.frame(results) +# # results$pc<-NA +# # results$country<-NA +# # extract_usa_postcodes <- function(df, usa_state, pc,country) { +# # # 6 digits +# # pattern <- "[0-9]{5}" +# # +# # # Loop through each row of the dataframe +# # for(i in 1:nrow(df)) { +# # # Find all matches of the pattern in the source column +# # matches <- gregexpr(pattern, df[i, usa_state]) +# # # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # # Extract the matches +# # extracted_codes <- regmatches(df[i, usa_state], matches) +# # # If there's at least one match and the target column is NA, copy the first match to the target column +# # if(length(extracted_codes[[1]]) > 0 && is.na(df[i, pc])) { +# # df[i, pc] <- extracted_codes[[1]][1] +# # df[i, country] <- "usa" +# # # df[i, city_col] <- df[i, source_col] +# # +# # } +# # } +# # return(df) +# # } +# # +# # +# # results <- extract_usa_postcodes(results, "usa_state", "pc","country") +# # +# # +# # +# # # Update `city_list` if necessary +# # city_list <- ifelse(is.na(city_list), results$usa_city, city_list) +# # # +# # # # Update `state_list` if necessary +# # state_list<-ifelse(is.na(state_list),results$usa_state, state_list) +# # # +# # # # Update `pc_list` if necessary +# # pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) +# # # Update `country_list` if necessary +# # country_list<-ifelse(is.na(country_list),results$country, country_list) +# # +# # +# # +# # # Because formats of address printing is different across platforms +# # # We are going to split using a tier system assuming first and last +# # # info is somewhat reliable and guess the other info from the +# # # remaining position of the info +# # +# # +# # +# # +# # any without numbers gets NA'd +# # results$pc[!grepl("\\d", results$pc)] <- NA +# +# extract_usa_postcodes <- function(df, source, dest1, dest2,dest3) { +# # state and zip +# pattern <- "[a-z]{2} [0-9]{5}" +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# if (df[i, dest3]=="usa"){ +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# } +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, dest1])) { +# df[i, dest1] <- extracted_codes[[1]][1] +# } +# # if(length(extracted_codes[[1]]) > 0 && is.na(df[i, dest1])) { +# # df[i, dest2] <- df[i, source] +# # } +# # if(length(extracted_codes[[1]]) > 0 && is.na(df[i, dest1])) { +# # df[i, dest3] <- "usa" +# # } +# } +# return(df) +# } +# +# +# +# a_df <- extract_usa_postcodes(a_df, "city", "postal_code", "state","country") +# a_df <- extract_usa_postcodes(a_df, "country","postal_code","state","country") +# +# a_df$city<- ifelse(a_df$country=="usa" & ((a_df$city==a_df$state) & (a_df$state==a_df$postal_code)), +# a_df$department,a_df$city) +# +# # keep portions with numbers / remove city names +# a_df$state<-ifelse(a_df$country=="usa", sub(" .*", "",results$usa_state), a_df$state) +# a_df$state<-ifelse(a_df$country=="usa", sub(" usa","",results$usa_state), a_df$state) +# results$usa_state<-sub("[0-9]{5}","",results$usa_state) +# results$usa_state<-sub("usa","",results$usa_state) +# results$usa_state<-trimws(results$usa_state, which = "both") +# +# results$country<-ifelse(is.na(results$usa_city)==FALSE,"usa",results$country) +# +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$usa_city, city_list) +# # +# # # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$usa_state, state_list) +# # +# # # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) +# # Update `country_list` if necessary +# +# country_list<-ifelse((grepl("usa",country_list)),"usa", country_list) +# # remove any with "state_abbrev zip code" but no USA +# country_list <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", country_list, ignore.case = TRUE), "usa", country_list) +# +# +# +# us_state_abbreviations_lower <- c("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", +# "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", +# "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", +# "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", +# "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy") +# +# country_list<-ifelse(country_list%in%us_state_abbreviations_lower,"usa", country_list) +# +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$usa_city, city_list) +# # +# # # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$usa_state, state_list) +# # +# # # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) +# # Update `country_list` if necessary +# +# rm(results) +# +# +# # a_df$postal_code <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$country, ignore.case = TRUE), a_df$country, a_df$postal_code) +# # a_df$state <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$country, ignore.case = TRUE), a_df$country, a_df$state) +# # a_df$state <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$state, ignore.case = TRUE), gsub("[0-9]{5}","",a_df$state), a_df$state) +# # a_df$postal_code <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$postal_code, ignore.case = TRUE), gsub("[a-z]{2}","",a_df$postal_code), a_df$postal_code) +# # a_df$country <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$country, ignore.case = TRUE), "usa", a_df$country) +# # +# # a_df$state <-trimws(a_df$state,which = "both") +# # a_df$postal_code<- trimws(a_df$postal_code,which = "both") +# # a_df$country <- trimws(a_df$country,which = "both") +# +# +# +# # Postal Codes letters-numbers -------------------------------------------- +# +# # Define the function +# extract_postcodes <- function(df, source_col, target_col,sequence) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- sequence +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# } +# } +# return(df) +# } +# +# # usage +# +# sequence<-"\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" +# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) +# sequence<-"\\bbr[-][0-9]{2} [0-9]{5}\\b" +# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) +# sequence<-"\\b[0-9]{5}-[0-9]{3}\\b" +# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) +# +# +# # postal codes - numbers after -------------------------------------------- +# sequence<-"\\b [0-9]{3,8}\\b" +# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) +# +# +# +# # postal codes - uk+ ------------------------------------------------------ +# +# # Define the function +# extract_uk_postcodes <- function(df, source_col, target_col,city_col,sequence) { +# # Regular expression pattern for UK postal codes +# # One or two initial letters. +# # One or two digits (and possibly a letter). +# # A mandatory space. +# # One digit followed by two letters. +# pattern <- sequence +# +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# df[i, city_col] <- df[i, source_col] +# +# } +# } +# return(df) +# } +# +# # Example usage +# sequence <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" +# # extract_uk_postcodes <- function(df, source_col, target_col,city_col,sequence) +# +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# sequence <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [A-Za-z]{2}" +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# sequence <- "[A-Za-z]{1,2}[0-9R]{3} [A-Za-z]{3}" +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# sequence <- "[A-Za-z]{2}[0-9R]{3}[A-Za-z]{2}" +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# sequence <- "[A-Za-z]{1}[0-9]{3} [0-9]{1}[A-Za-z]{2}" +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# sequence <- "[0-9]{4} [0-9]{2}[A-Za-z]{1}" +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# sequence <- "[A-Za-z]{2} [A-Za-z]{1} [0-9]{3}[A-Za-z]{2}" +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# +# # postal codes - canada --------------------------------------------------- +# a_df$postal_code <- ifelse(a_df$country=="canada", NA,a_df$postal_code) +# +# a_df$state <- ifelse(a_df$country=="canada" & +# grepl("[A-Za-z]{2}", a_df$city, ignore.case = TRUE), +# a_df$city, a_df$state) +# +# a_df$city <- ifelse(a_df$country=="canada" & +# a_df$city==a_df$state, +# NA, a_df$city) +# +# +# +# +# # Define the function +# extract_postcodes <- function(df, source_col, target_col,sequence) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- sequence +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# } +# } +# return(df) +# } +# +# # usage +# +# sequence<-"\\b[A-Za-z]{1}[0-9]{1}[A-Za-z]{1} [0-9]{1}[A-Za-z]{1}[0-9]{1}\\b" +# a_df <- extract_postcodes(a_df, "state","postal_code",sequence) +# +# sequence<-"\\b[A-Za-z]{1}[0-9]{1}[A-Za-z]{1} [0-9]{1}[A-Za-z]{1}[0-9]{1}\\b" +# a_df <- extract_postcodes(a_df, "department","postal_code",sequence) +# +# a_df$state<-ifelse(a_df$country=="canada", sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", a_df$state),a_df$state) # all after first space +# +# +# a_df$city<-ifelse(a_df$country=="canada" & is.na(a_df$city),a_df$department,a_df$city) # all after first space +# +# a_df$city<-ifelse(a_df$country=="canada", sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", a_df$city),a_df$city) # all after first space +# +# a_df$state<-ifelse(a_df$country=="canada", gsub("[0-9]", "", a_df$state),a_df$state) +# +# +# +# # postal codes - india ---------------------------------------------------- +# +# a_df$state<-ifelse(a_df$country=="india", a_df$city,a_df$state) +# +# # Define the function +# extract_postcodes <- function(df, source_col, target_col,city_col,sequence) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- sequence +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# df[i, city_col] <- df[i, source_col] +# } +# } +# return(df) +# } +# +# # usage +# +# sequence<-"[0-9]{5,8}" +# a_df <- extract_postcodes(a_df, "department","postal_code","city",sequence) +# +# +# a_df$department<-ifelse(a_df$country=="india" & a_df$city==a_df$department,NA,a_df$department) +# a_df$city<-ifelse(a_df$country=="india" & a_df$city==a_df$state,NA,a_df$city) +# a_df$city<-ifelse(a_df$country=="india" & is.na(a_df$city),a_df$department,a_df$city) +# +# +# # Define the function +# extract_postcodes2 <- function(df, source_col, target_col,city_col,sequence) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- sequence +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0) { +# df[i, city_col] <- df[i, source_col] +# } +# } +# return(df) +# } +# +# # usage +# +# sequence<-"[0-9]{5,8}" +# a_df <- extract_postcodes2(a_df, "postal_code","city","city",sequence) +# +# a_df$city <- ifelse(a_df$country=="india" & (grepl("delhi", a_df$city)|grepl("delhi", a_df$state)), +# "new delhi", a_df$city) +# +# +# +# +# +# # postal codes - australia ------------------------------------------------- +# +# a_df$state<-ifelse(a_df$country=="australia", a_df$city,a_df$state) +# a_df$city<-ifelse(a_df$country=="australia", a_df$department,a_df$city) +# +# a_df$state<-ifelse(a_df$country=="australia", gsub("\\s[0-9]{4,5}", "", a_df$state),a_df$state) +# a_df$city<-ifelse(a_df$country=="australia", gsub("\\s[0-9]{4,5}", "", a_df$city),a_df$city) +# +# +# # Brazil ------------------------------------------------------------------ +# +# +# +# +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# to_check <- c("dept","ctr","inst","ppg","andar","empresas", +# "programa", "ciencias", "unidade", "lab ") +# +# a_df$department <- ifelse(a_df$country=="brazil" & grepl(paste(to_check, collapse = "|"), a_df$department), +# NA, +# a_df$department) +# +# +# +# +# +# +# # Define the function +# extract_brl_postcodes <- function(df, source_col, target_col) { +# # 6 digits +# +# pattern <- "br-[0-9]{5,8}" +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# # df[i, brl_new_city] <- df[i, source_col] +# # df[i, city_col] <- df[i, source_col] +# +# } +# } +# return(df) +# } +# +# +# # df, source_col, target_col,city_col +# a_df <- extract_brl_postcodes(a_df, "department","postal_code") +# +# +# +# +# +# +# +# +# +# +# +# +# +# results <- extract_brl_postcodes(results, "brl_state","brl_pc") +# +# +# results$brl_new_city <- ifelse(is.na(results$brl_new_city), +# results$brl_state, +# results$brl_new_city) +# +# +# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, +# results$brl_city, +# results$brl_state) +# +# +# +# +# results$brl_new_city <- ifelse(is.na(results$brl_new_city), +# results$brl_city, +# results$brl_new_city) +# +# +# +# +# results$brl_city<-gsub("br-","",results$brl_city) +# results$brl_city<-sub("[0-9]{2,8}","",results$brl_city) +# results$brl_city<-sub("-","",results$brl_city) +# +# +# results$brl_new_city<-gsub("br-","",results$brl_new_city) +# results$brl_new_city<-sub("[0-9]{2,8}","",results$brl_new_city) +# results$brl_new_city<-sub("-","",results$brl_new_city) +# +# +# results$brl_pc<-gsub("br-","",results$brl_pc) +# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) +# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) +# +# results$brl_state<-gsub("br-","",results$brl_state) +# results$brl_state<-sub("[0-9]{2,8} ","",results$brl_state) +# results$brl_state<-sub(" [0-9]{2,8} ","",results$brl_state) +# +# # +# # +# # +# # +# # +# # results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) +# # results$brl_new_city<-gsub("gavea rio de janeiro","rio de janeiro",results$brl_new_city) +# # results$brl_new_city<-gsub("gavea","rio de janeiro",results$brl_new_city) +# # # +# # results$brl_city<-gsub("-","",results$brl_city) +# # +# # results$brl_state<-gsub("br-","",results$brl_state) +# # results$brl_state<-gsub("-","",results$brl_state) +# +# +# +# +# +# # any without numbers gets NA'd +# results$brl_pc[!grepl("\\d", results$brl_pc)] <- NA +# +# # keep portions with numbers / remove city names +# +# results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) +# results$brl_city<-sub(" [0-9]{2,8}","",results$brl_city) +# # +# # # Specific replacements +# # results$brl_city <- ifelse(grepl("rio de janeiro", results$brl_city), +# # "rio de janeiro", results$brl_city) +# +# results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), +# "rio de janeiro", results$brl_city) +# results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), +# "rj", results$brl_state) +# results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), +# "sp", results$brl_state) +# +# +# +# results$brl_city[results$brl_city==results$brl_state]<-NA +# +# +# # Clean up and adjust columns +# results[] <- lapply(results, trimws) +# +# +# # Define city-to-state mapping +# city_state_mapping <- data.frame( +# city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), +# state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), +# stringsAsFactors = FALSE +# ) +# +# # Match cities and states +# for (i in 1:nrow(city_state_mapping)) { +# results$brl_city <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), +# city_state_mapping$city[i], results$brl_city) +# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), +# city_state_mapping$state[i], results$brl_state) +# } +# +# +# # Match cities and states +# for (i in 1:nrow(city_state_mapping)) { +# +# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), +# city_state_mapping$state[i], results$brl_state) +# } +# +# +# +# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, +# results$brl_city, +# results$brl_state) +# +# +# results$brl_city <- trimws(results$brl_city, which = "both") +# results$brl_state <- trimws(results$brl_state, which = "both") +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# to_check <- c("dept","ctr","inst","ppg", +# "programa", "ciencias", "unidade", "lab ") +# +# results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), +# results$brl_state, +# results$brl_city) +# +# +# # Final trimming +# results[] <- lapply(results, trimws) +# +# results$brl_city <- ifelse(results$brl_new_city==results$brl_city, +# NA, +# results$brl_city) +# diff --git a/R/authors_georef.R b/R/authors_georef.R index 1b31527..2c60f19 100644 --- a/R/authors_georef.R +++ b/R/authors_georef.R @@ -252,7 +252,8 @@ if (google_api == TRUE) { "city", "state", "country", "postal_code", "authorID" )] - + a_df$country[a_df$country=="could not be extracted"]<-NA + a_df$state[a_df$state=="no state"]<-NA a_df<-a_df[!is.na(a_df$country),] # select the following columns from the fll dataframe # a_df<-("authorID", "city","state","postal_code","country") @@ -289,8 +290,8 @@ a_df$addr <- ifelse(a_df$country == "usa", colnames(to_georef_df) <- c("addr") # to_georef_df <- na.omit(to_georef_df) - - # library(tidygeocoder) + # FOR TESTING ONLY: + # library(tidygeocoder) to_georef_df<-to_georef_df %>% sample_n(500) to_georef_df <- to_georef_df |> tidygeocoder::geocode(addr, method = "osm", lat = latitude, long = longitude From 5c509ef27f9470d53afd684b9ee9daaccd1abad1 Mon Sep 17 00:00:00 2001 From: embruna Date: Thu, 6 Feb 2025 09:56:41 -0500 Subject: [PATCH 10/34] updates --- R/authors_address.R | 94 -------------------------------------- R/authors_address_update.R | 26 ++++++++--- 2 files changed, 19 insertions(+), 101 deletions(-) diff --git a/R/authors_address.R b/R/authors_address.R index f8f096b..7c22e5b 100644 --- a/R/authors_address.R +++ b/R/authors_address.R @@ -100,100 +100,6 @@ authors_address <- function(addresses, ID){ a_df$country[grepl(us_reg, a_df$country)] <- "usa" - - # Added by eb - - # USA ZIP CODES - - a_df$state<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$second_tier), - a_df$second_tier,a_df$state) - a_df$postal_code<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$second_tier), - a_df$second_tier,a_df$postal_code) - a_df$city<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$second_tier), - a_df$third_tier,a_df$city) - - a_df$state<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$state), - gsub("[[:digit:]]{5}","",a_df$state),a_df$state) - - a_df$postal_code<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$postal_code), - gsub("[[:alpha:]]{2}","",a_df$postal_code),a_df$postal_code) - - - # BRAZIL CODES - - a_df$postal_code<- ifelse(a_df$country=="brazil" & grepl("[a-z]{2}-[0-9]{4,8}", a_df$second_tier), - a_df$second_tier,a_df$postal_code) - a_df$city<- ifelse(a_df$country=="brazil" & grepl("[a-z]{2}-[0-9]{4,8}", a_df$second_tier), - a_df$second_tier,a_df$city) - - a_df$postal_code<- ifelse(a_df$country=="brazil" & grepl("[a-z]{2}-[0-9]{4,8}", a_df$third_tier), - a_df$third_tier,a_df$postal_code) - a_df$city<- ifelse(a_df$country=="brazil" & grepl("[a-z]{2}-[0-9]{4,8}", a_df$third_tier), - a_df$third_tier,a_df$city) - - - - a_df$postal_code<- ifelse(a_df$country=="brazil", - gsub(" .*","",a_df$postal_code),a_df$postal_code) - - a_df$city<- ifelse(a_df$country=="brazil", - gsub(".*br-[0-9]+ ", "", a_df$city),a_df$city) - - - a_df$state<- ifelse(a_df$country=="brazil" & is.na(a_df$city) & is.na(a_df$postal_code), - a_df$second_tier,a_df$state) - - a_df$city<- ifelse(a_df$country=="brazil" & is.na(a_df$city) & is.na(a_df$postal_code), - a_df$third_tier,a_df$city) - - a_df$state<- ifelse(a_df$country=="brazil" & nchar(a_df$second_tier)==2,a_df$second_tier,a_df$state) - - - a_df$postal_code<- ifelse(a_df$country=="brazil", - gsub("br-", "", a_df$postal_code),a_df$postal_code) - - - a_df$state<- ifelse(a_df$country=="brazil" & grepl("[a-z]{2} [0-9]{5}", a_df$state), - gsub("[[:digit:]]{5}","",a_df$state),a_df$state) - - a_df$postal_code<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$postal_code), - gsub("[[:alpha:]]{2}","",a_df$postal_code),a_df$postal_code) - - - - # Define words indicating this is actually a dept not state or postal code - # will use this list to delete the ones that don't apply - to_check <- c("dept","ctr","inst","ppg","andar","empresas", - "cena","educ","programa", "ciencias", "unidade", "lab ") - - a_df$city <- ifelse(a_df$country=="brazil" & grepl(paste(to_check, collapse = "|"), a_df$city), - a_df$second_tier, - a_df$city) - - - - a_df$state <- ifelse(a_df$country=="brazil" &grepl(paste(to_check, collapse = "|"), a_df$state), - a_df$second_tier,a_df$state) - - a_df$city<- ifelse(a_df$country=="brazil" & is.na(a_df$city) & is.na(a_df$postal_code), - a_df$second_tier,a_df$city) - - - a_df$city <- ifelse(grepl("museu nacl", a_df$city), - "rio de janeiro", a_df$city) - a_df$state <- ifelse(grepl("rio de janeiro", a_df$state, ignore.case = TRUE), - "rj", a_df$state) - a_df$state <- ifelse(grepl("sao paulo", a_df$state, ignore.case = TRUE), - "sp", a_df$state) - - - - results$brl_city[results$brl_city==results$brl_state]<-NA - - - # Clean up and adjust columns - results[] <- lapply(results, trimws) - ########################## # We'll use regular expression to pull zipcodes # These formats differ by region diff --git a/R/authors_address_update.R b/R/authors_address_update.R index d22f8b1..86ef819 100644 --- a/R/authors_address_update.R +++ b/R/authors_address_update.R @@ -631,7 +631,6 @@ authors_address <- function(addresses, ID){ a_df$city <- ifelse(a_df$country=="india" & (grepl("delhi", a_df$city)|grepl("delhi", a_df$state)), "new delhi", a_df$city) - # china ------------------------------------------------------------------- @@ -815,6 +814,11 @@ authors_address <- function(addresses, ID){ + # netherlands ------------------------------------------------------------- + # cities often have two characters at start (ascii version of ligature/dipthong) + a_df[] <- lapply(a_df, trimws) + a_df$city <- ifelse(a_df$country=="netherlands" & grepl("^[a-zA-Z]{2} ", a_df$city), + (sub("^[a-zA-Z]{2} ","", a_df$city)),a_df$city) @@ -831,6 +835,8 @@ authors_address <- function(addresses, ID){ + a_df[] <- lapply(a_df, trimws) + a_df$city<-ifelse(a_df$city=="university pk", "university park", a_df$city) @@ -902,20 +908,26 @@ authors_address <- function(addresses, ID){ "buenos aires",a_df$city) + a_df$city <- ifelse(grepl("^st ", a_df$city), + (sub("^st ","saint ", a_df$city)),a_df$city) + + a_df$city <- ifelse(grepl(" st ", a_df$city), + (sub(" st "," saint ", a_df$city)),a_df$city) + a_df$city <- ifelse(grepl("^ste ", a_df$city), + (sub("^ste ","saint ", a_df$city)),a_df$city) } - # usa cz = canal zone + # usa cz = canal zone, ada, apo, dpo # scotland mrc - # usa ada, apo, dpo - # st for saint (but cant change forest or west) - # netheerlands first two letters of city names xx name - # ste for saint - # + + + + # country<- a_df %>% # # mutate(city_match=(city==second_tier)) %>% # # filter(city_match==FALSE) %>% From 84326bf3dfe3f3ee36ff35e2266732b317473c7c Mon Sep 17 00:00:00 2001 From: embruna Date: Fri, 7 Mar 2025 15:03:29 -0500 Subject: [PATCH 11/34] added cleanup of japan and england --- R/authors_address_update.R | 183 ++++++++++++++++++++++++++++++++++++- 1 file changed, 178 insertions(+), 5 deletions(-) diff --git a/R/authors_address_update.R b/R/authors_address_update.R index 86ef819..7f6306e 100644 --- a/R/authors_address_update.R +++ b/R/authors_address_update.R @@ -19,6 +19,8 @@ authors_address <- function(addresses, ID){ message("\nSplitting addresses\n") list_address <- strsplit(addresses, ",") + + # remove punctuation ---------------------------------------- ## First remove periods and trim white space from countries. ## helps avoids mistakes later on @@ -43,6 +45,7 @@ authors_address <- function(addresses, ID){ # correct countries ------------------------------------------------------- + # correct names # Define the function correct_countries <- function(my_list, replacements) { @@ -55,7 +58,9 @@ authors_address <- function(addresses, ID){ if(len > 0 && my_list[[i]][len] %in% names(replacements)) { # Replace the last item with the corresponding replacement word my_list[[i]][len] <- replacements[[my_list[[i]][len]]] - } + + } + } return(my_list) } @@ -245,7 +250,6 @@ authors_address <- function(addresses, ID){ - # any PC without numbers gets NA'd a_df$postal_code[!grepl("\\d", a_df$postal_code)] <- NA @@ -772,7 +776,7 @@ authors_address <- function(addresses, ID){ -# final postal codes - consecutinve numbers ------------------------------- +# final postal codes - consecutive numbers ------------------------------- # Define the function extract_consecutive_numbers <- function(df, source, destination) { df[[destination]] <- sapply(1:nrow(df), function(i) { @@ -837,6 +841,19 @@ authors_address <- function(addresses, ID){ a_df[] <- lapply(a_df, trimws) + # Remove panama canal zone from usa states (for stri) + a_df$state<-ifelse((a_df$country=="usa" & a_df$state=="cz"), + NA,a_df$state) + # armed forces & diplomatic + a_df$state<-ifelse((a_df$country=="usa" & a_df$state=="aa"), + NA,a_df$state) + + a_df$city<-ifelse((a_df$country=="usa" & a_df$state=="apo"), + NA,a_df$city) + + a_df$city<-ifelse((a_df$country=="usa" & a_df$state=="dpo"), + NA,a_df$city) + a_df$city<-ifelse(a_df$city=="university pk", "university park", a_df$city) @@ -891,6 +908,7 @@ authors_address <- function(addresses, ID){ a_df$city<-ifelse((a_df$country=="brazil" & (a_df$city == "rio de janerio"| a_df$city == "rio de janiero"| a_df$city == "rio der janeiro"| + a_df$city == "rio janeiro"| a_df$city == "rio janiero")), "rio de janeiro", a_df$city) @@ -917,13 +935,168 @@ authors_address <- function(addresses, ID){ a_df$city <- ifelse(grepl("^ste ", a_df$city), (sub("^ste ","saint ", a_df$city)),a_df$city) + + a_df$city <- ifelse(grepl("sioux ctr", a_df$city), + (sub("sioux ctr","sioux city", a_df$city)),a_df$city) + + + tech_words <- c(" lab ", "lab "," lab", "dept", "hosp", " inst","inst ", "ctr", + "unit", "ltd", "minist", "educ", "grad ", " sch ","sch "," sch", + "coll ", " sci ", "natl", "&", " med","med ", + "publ", "dept", "biomed", "phys", "technol", + "engn") + pattern <- paste(tech_words, collapse = "|") + + a_df$city<- ifelse((a_df$city!="esch sur alzette" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), + a_df$state, a_df$city) + + + a_df$state<- ifelse(a_df$state==a_df$city2,NA,a_df$state) + + + a_df$state<- ifelse(grepl("[[:digit:]]",a_df$state), + NA, a_df$state) + + + + a_df$state<-ifelse(a_df$state=="",NA,a_df$state) + a_df$postal_code<-ifelse(a_df$postal_code=="",NA,a_df$postal_code) + + us_state_abbreviations_lower <- c("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", + "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", + "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", + "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", + "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy") + pattern <- paste(us_state_abbreviations_lower, collapse = "|") + a_df$country_list<-country_list + a_df$state<- ifelse((a_df$country=="usa" & is.na(a_df$state) & grepl(pattern, a_df$address, ignore.case = TRUE, perl = TRUE)), + a_df$country_list,a_df$state) + + + + a_df$state<- ifelse((a_df$country=="usa" & grepl("[[:digit:]]",a_df$state)), + gsub("[[:digit:]]","", a_df$state),a_df$state) + a_df$state<- ifelse((a_df$country=="usa" & grepl("usa",a_df$state)), + gsub("usa","", a_df$state),a_df$state) + a_df$state<- trimws(a_df$state, which = "both") + + + # fixing japanese prefectures and cities, which are sometimes swapped. + + + to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", + "dept", "div", "univ", "hosp", "coll", "sci", "rd", + "program","minist", "educ", "sch ", "grad ", "fac ", + "assoc","forest", "corp") + pattern <- paste(to_delete, collapse = "|") + a_df$city2<- ifelse((a_df$country=="japan" & grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), + NA,a_df$city2) + + # Remove any with numbers + a_df$city2<- ifelse((a_df$country=="japan" & grepl("[[:digit:]]", a_df$city2)), + NA,a_df$city2) + + + + japan_prefectures <- c("hokkaido","aomori", "iwate","miyagi","akita", + "yamagata","fukushima","ibaraki","tochigi","gunma", + "saitama","chiba","tokyo","kanagawa","niigata", + "toyama","ishikawa","fukui","yamanashi","nagano","gifu", + "shizuoka","aichi","mie","shiga","kyoto","osaka","gumma", + "hyogo","nara","wakayama","tottori","shimane", + "okayama","hiroshima","yamaguchi","tokushima","kagawa", + "ehime","kochi","fukuoka","saga","nagasaki","kumamoto", + "oita","miyazaki","kagoshima","okinawa") + pattern <- paste(japan_prefectures, collapse = "|") + + + + a_df$state<- ifelse((a_df$country=="japan" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), + a_df$city,a_df$state) + + + # This removes all special regions of a city like tokyo from city2 + a_df$city2<- ifelse((a_df$country=="japan" & grepl(" ku", a_df$city2, ignore.case = TRUE, perl = TRUE)), + NA,a_df$city2) + + # replace from city with city2 EXCEPT in cases where no state (and therefore + # city is correct) and where no city2 (otherwise would bring in NA) + + a_df$city<- ifelse((a_df$country=="japan" & !(is.na(a_df$state)) & !(is.na(a_df$city2))), + a_df$city2,a_df$city) + + + + # Fix Scotland + + a_df$city<- ifelse((a_df$country=="scotland" & grepl("univ ", a_df$city, ignore.case = TRUE, perl = TRUE)), + gsub("univ ","",a_df$city),a_df$city) + + + + to_delete <- c(" ave", " grp", "hlth", " rd", "mrc"," oba","plz", + " dr", "oqb", " quad","fisheries") + + pattern <- paste(to_delete, collapse = "|") + a_df$city<- ifelse((a_df$country=="scotland" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), + NA,a_df$city) + + + # Fix UK + to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", + "dept", "div", "univ", "hosp", "coll", "sci", "rd", + "program","minist", "educ", "sch ", "grad ", "fac ", + " sq", "quarter", " way", " dr", "diagnost", "consultant", + "microsoft","diagnost", "[[:digit:]]","project","facil", "grp", + "campus","expt"," pk", "canc","assoc","forest", "corp", + "consortium", "partners", "lane","ucl","street","trust", + "business", "inform", "royal","survey","drosophila", " st", + "ndorms", "nat hist", "hlth", " ave","council", "unit", "nerc", "nat res") + pattern <- paste(to_delete, collapse = "|") + a_df$city2<- ifelse((a_df$country=="england" & grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), + NA,a_df$city2) + + a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)), + a_df$city2,a_df$city) + + + a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)) & + grepl("london", a_df$address, ignore.case = TRUE, perl = TRUE), + "london",a_df$city) + + a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)) & + grepl("cambridge", a_df$address, ignore.case = TRUE, perl = TRUE), + "cambridge",a_df$city) + + a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)) & + grepl("oxford", a_df$address, ignore.case = TRUE, perl = TRUE), + "oxford",a_df$city) + + a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)) & + grepl("durham", a_df$address, ignore.case = TRUE, perl = TRUE), + "durham",a_df$city) + + a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)) & + grepl("bristol", a_df$address, ignore.case = TRUE, perl = TRUE), + "bristol",a_df$city) + + a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)) & + grepl("lancaster", a_df$address, ignore.case = TRUE, perl = TRUE), + "lancaster",a_df$city) + + + + a_df$city2 <- NULL + a_df$country_list<-NULL + + } - # usa cz = canal zone, ada, apo, dpo - # scotland mrc + + From 3b0aa5d42e32cbbfa800b070afd914808a9fbc18 Mon Sep 17 00:00:00 2001 From: embruna Date: Fri, 7 Mar 2025 15:27:07 -0500 Subject: [PATCH 12/34] remove big authors_address update from this branch --- R/authors_address_update.R | 3618 ------------------------------------ 1 file changed, 3618 deletions(-) delete mode 100644 R/authors_address_update.R diff --git a/R/authors_address_update.R b/R/authors_address_update.R deleted file mode 100644 index 7f6306e..0000000 --- a/R/authors_address_update.R +++ /dev/null @@ -1,3618 +0,0 @@ -#' Parses out address information and splits it into its respective parts. -#' This is an internal function used by \code{authors_clean} -#' -#' \code{authors_address} This function takes the output from -#' \code{references_read} and pulls out address information. Splitting it into -#' university, department, city, state, etc. -#' @param addresses the addresses -#' @param ID the authorID -#' @noRd -authors_address <- function(addresses, ID){ - - # DELETE - # library(tidyverse) - # library(refsplitr) - final<-read.csv("./data/wos_txt/final.csv") - addresses<-final$address - ID<-final$authorID - addresses<-tolower(addresses) - message("\nSplitting addresses\n") - list_address <- strsplit(addresses, ",") - - - - # remove punctuation ---------------------------------------- - ## First remove periods and trim white space from countries. - ## helps avoids mistakes later on - - remove_period_from_last <- function(list_address) { - lapply(list_address, function(x) { - if (length(x) > 0) { - x[length(x)] <- gsub("\\.$", "", x[length(x)]) - x[length(x)] <- trimws(x[length(x)], which = "both") - } - return(x) - }) - } - - list_address <- remove_period_from_last(list_address) - -# trim ws ----------------------------------------------------------------- - - list_address <- lapply(list_address, trimws) - - - -# correct countries ------------------------------------------------------- - - - # correct names - # Define the function - correct_countries <- function(my_list, replacements) { - # Loop through each element of the list - for(i in 1:length(my_list)) { - # Get the length of the current element - len <- length(my_list[[i]]) - - # Check if the last item matches any of the target words - if(len > 0 && my_list[[i]][len] %in% names(replacements)) { - # Replace the last item with the corresponding replacement word - my_list[[i]][len] <- replacements[[my_list[[i]][len]]] - - } - - } - return(my_list) - } - # czechia = new name for czech republic - # TBD: united arab rep, - replacements <- c("austl" = "australia", - "c z" = "czechia", - "cz" = "czechia", - "czech republic" = "czechia", - "fed rep ger" = "germany", - "columbia" = "colombia", - "peoples r china" = "china", - "u arab emirates" = "united arab emirates", - "mongol peo rep" = "mongolia", - "dominican rep" = "dominican republic", - "fr polynesia" = "french polynesia", - "neth antilles" = "netherland antilles", - "trinid & tobago" = "trinidad & tobago", - "rep congo" = "congo", - "north ireland" = "northern ireland", - "syrian arab rep" = "syria" - ) - - - list_address<- correct_countries(list_address, replacements) - # a_df$country <- correct_countries(a_df$country, replacements) - - -# extract university ------------------------------------------------------ - - - university_list <- vapply(list_address, function(x) x[1], character(1)) - -# extract department ------------------------------------------------------ - - # # If department is listed it is typically second - # # this will be 2x checked later - # ## EB: seond only if 4+ slots - dept_extract <- function(x) { - if (length(x) < 4) { - return(NA) - } else { - return(trimws(x[[2]])) - } - } - # - dept_list <- unlist(lapply(list_address, dept_extract)) - - # dept_list <- vapply(list_address, function(x) x[2], character(1)) - dept_list <- trimws(dept_list, which = "both") - - - - # Extract City ------------------------------------------------------------ - - # If there is only one element, then it can't have both city and country' - city_list <- vapply(list_address, function(x) { - n <- length(x) - if (n == 1) { - return("no city") # placeholder to replace with NA after function - } - - # In some cases city is next-to-last element, in others next-to-next-to-last - last_element <- x[[n]] - second_last <- if (n > 1) x[[n - 1]] else NA - third_last <- if (n > 2) x[[n - 2]] else NA - - # Check for India, China, & brazil, canada, australia, and UK. - # These countries' city is in multiple places - # This puts ina placeholder, which will be replaced later in - # the function that checks India and China - # if(last_element %in% c("india", "china", - # "brazil", "canada", "australia", - # "scotland", "england", "wales", - # "northern ireland")) - # { - # return("icb") # placeholder to replace with NA after function - # } - # - # - # if (grepl("usa",last_element)) { - # return("icb") # placeholder to replace with NA after function - # } - - # And of course a few other odd ones. This will check for - # other countries with specific rules. - # if ((last_element == "australia" && second_last != "liverpool") || - # last_element %in% c("wales") || - # (last_element == "mexico" && second_last != "iztapalapa") || - # (last_element == "argentina" && second_last == "df")) { - # return(third_last) - # } - - # Default case - return(second_last) - }, character(1)) - - # Cleanup - city_list <- trimws(city_list, which = "both") - city_list[city_list == "no city"] <- NA - city_list[city_list == "icb"] <- NA - - -# extract state ----------------------------------------------------------- - - # Extract City ------------------------------------------------------------ - - # If there is only one element, then it can't have both city and country' - state_list <- vapply(list_address, function(x) { - n <- length(x) - if (n == 1) { - return("no state") # placeholder to replace with NA after function - } - - # In some cases city is next-to-last element, in others next-to-next-to-last - last_element <- x[[n]] - second_last <- if (n > 1) x[[n - 1]] else NA - third_last <- if (n > 2) x[[n - 2]] else NA - - # Check for India, China, & brazil, canada, australia, and UK. - # These countries' city is in multiple places - # This puts ina placeholder, which will be replaced later in - # the function that checks India and China - # if(last_element %in% c("india", "china", - # "brazil", "canada", "australia", - # "scotland", "england", "wales", - # "northern ireland")) - # { - # return("icb") # placeholder to replace with NA after function - # } - # - # - # if (grepl("usa",last_element)) { - # return("icb") # placeholder to replace with NA after function - # } - - # And of course a few other odd ones. This will check for - # other countries with specific rules. - # if ((last_element == "australia" && second_last != "liverpool") || - # last_element %in% c("wales") || - # (last_element == "mexico" && second_last != "iztapalapa") || - # (last_element == "argentina" && second_last == "df")) { - # return(third_last) - # } - - # Default case - return(third_last) - }, character(1)) - - # Cleanup - state_list <- trimws(state_list, which = "both") - state_list[state_list == "no city"] <- NA - state_list[state_list == "icb"] <- NA - - - city_list2 <- trimws(state_list, which = "both") - # Extract Country --------------------------------------------------------- - - - country_list <- vapply(list_address, function(x) { - gsub("\\_", "", x[length(x)]) }, - character(1)) - - - -# pc list ----------------------------------------------------------------- - -# pc often with city - - pc_list<-city_list - -# bind into df ------------------------------------------------------------ - - - a_df <- data.frame( - adID = ID, - university = university_list, - country = country_list, - state = state_list, - postal_code = pc_list, - city = city_list, - city2 = city_list2, - department = dept_list, - address = addresses, - stringsAsFactors = FALSE - ) - - - - # any PC without numbers gets NA'd - a_df$postal_code[!grepl("\\d", a_df$postal_code)] <- NA - - # copy over PC and state - a_df$state<- ifelse(grepl("usa",a_df$country) & nchar(a_df$state)>2, - NA, - a_df$state) - - - a_df$postal_code<- ifelse(grepl("[a-z]{2} [0-9]{5} usa", a_df$country), - a_df$country,a_df$postal_code) - - a_df$state <- ifelse(grepl("[a-z]{2} [0-9]{5} usa", a_df$country), - a_df$country,a_df$state) - - a_df$state<- ifelse(grepl("[a-z]{2} [0-9]{5}", a_df$city), - a_df$city,a_df$state) - - - a_df$state<- ifelse(grepl("[a-z]{2} usa", a_df$country), - a_df$country,a_df$state) - - # remove the numbers and letters as appropriate - - - a_df$country<- ifelse(grepl(" usa", a_df$country), - "usa",a_df$country) - - a_df$state<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$state), - gsub("[[:digit:]]{5}","",a_df$state),a_df$state) - - a_df$state<- ifelse(a_df$country=="usa" & grepl(" usa", a_df$state), - gsub(" usa","",a_df$state),a_df$state) - - - a_df$postal_code<- ifelse(a_df$country=="usa", - gsub("[[:alpha:]]{2} ","", - a_df$postal_code),a_df$postal_code) - - a_df$postal_code<- ifelse(a_df$country=="usa", - gsub(" usa","", - a_df$postal_code),a_df$postal_code) - - - - - a_df$city<- ifelse(a_df$country=="usa" & grepl("[a-z]{2} [0-9]{5}", a_df$city), - a_df$city2,a_df$city) - - - pattern <- "[a-z]{2} [0-9]{5}" - - a_df$postal_code<- ifelse(grepl(pattern, a_df$country), - a_df$country,a_df$postal_code) - a_df$state<- ifelse(grepl(pattern, a_df$country), - a_df$country,a_df$state) - a_df$country<- ifelse(grepl(pattern, a_df$country), - "usa",a_df$country) - a_df$postal_code<- ifelse(a_df$country=="usa" & grepl(pattern, a_df$postal_code), - gsub("[a-z]","",a_df$postal_code),a_df$postal_code) - a_df$state<- ifelse(a_df$country=="usa" & grepl(pattern, a_df$state), - gsub("[0-9]","",a_df$postal_code),a_df$state) - - - #TODO: correct this to catch any that didn;t get caught - - # - # us_state_abbreviations_lower <- c("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", - # "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", - # "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", - # "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", - # "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy") - # - # country_list<-ifelse(country_list%in%us_state_abbreviations_lower,"usa", country_list) - - - -# brazil clean-up --------------------------------------------------------- - - a_df$state<- ifelse(a_df$country=="brazil" & nchar(a_df$city)==2, - a_df$city,a_df$state) - a_df$city<- ifelse(a_df$country=="brazil" & nchar(a_df$city)==2, - a_df$city2,a_df$city) - a_df$city2<- ifelse(a_df$country=="brazil" & a_df$city==a_df$city2, - NA,a_df$city2) - a_df$postal_code<- ifelse(a_df$country=="brazil" & is.na(a_df$postal_code), - a_df$city,a_df$postal_code) - a_df$state<- ifelse(a_df$country=="brazil" & nchar(a_df$state)>2, - NA,a_df$state) - - - a_df$postal_code<- ifelse(a_df$country=="brazil", - gsub("[A-Za-z]", "", a_df$postal_code), - a_df$postal_code) - a_df$postal_code<- ifelse(a_df$country=="brazil", - gsub("[-]", "", a_df$postal_code), - a_df$postal_code) - - a_df$city<- ifelse(a_df$country=="brazil", - gsub("br-", "", a_df$city), - a_df$city) - a_df$city<- ifelse(a_df$country=="brazil", - gsub("[0-9]", "", a_df$city), - a_df$city) - - - a_df$state<- ifelse(a_df$country=="brazil" & grepl("br-", a_df$city2), - a_df$city,a_df$state) - a_df$postal_code<- ifelse(a_df$country=="brazil" & grepl("br-", a_df$city2), - a_df$city2,a_df$postal_code) - a_df$city<- ifelse(a_df$country=="brazil" & grepl("br-", a_df$city2), - a_df$city2,a_df$city) - - - # repeat the clean of city - a_df$city<- ifelse(a_df$country=="brazil", - gsub("br-", "", a_df$city), - a_df$city) - a_df$city<- ifelse(a_df$country=="brazil", - gsub("[0-9]", "", a_df$city), - a_df$city) - - - a_df$postal_code<- ifelse(a_df$country=="brazil", - gsub("[A-Za-z]", "", a_df$postal_code), - a_df$postal_code) - a_df$postal_code<- ifelse(a_df$country=="brazil", - gsub("[-]", "", a_df$postal_code), - a_df$postal_code) - - a_df[] <- lapply(a_df, trimws) - - - - # # Define words indicating this is actually a dept not state or postal code - # # will use this list to delete the ones that don't apply - # to_check <- c("dept","ctr","inst","ppg","andar","empresas", - # "programa", "ciencias", "unidade", "lab ") - # - # a_df$city <- ifelse(a_df$country=="brazil" & grepl(paste(to_check, collapse = "|"), a_df$city), - # a_df$state, - # results$brl_city) - # - # - # - # results$brl_state <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_state), - # results$brl_city, - # results$brl_state) - - # - # results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), - # "rio de janeiro", results$brl_city) - # results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), - # "rj", results$brl_state) - # results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), - # "sp", results$brl_state) - # - - - # results$brl_city[results$brl_city==results$brl_state]<-NA - - - # Clean up and adjust columns - # results[] <- lapply(results, trimws) - - - # Define city-to-state mapping - city_state_mapping <- data.frame( - city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), - state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), - stringsAsFactors = FALSE - ) - - # Match cities and states - for (i in 1:nrow(city_state_mapping)) { - a_df$city <- ifelse(a_df$country=="brazil" & grepl(city_state_mapping$city[i], a_df$state, ignore.case = TRUE), - city_state_mapping$city[i], a_df$city) - a_df$state <- ifelse(a_df$country=="brazil" & grepl(city_state_mapping$city[i], a_df$state, ignore.case = TRUE), - city_state_mapping$state[i], a_df$state) - } - - # Match cities and states - for (i in 1:nrow(city_state_mapping)) { - a_df$state <- ifelse(a_df$country=="brazil" & grepl(city_state_mapping$city[i], a_df$city, ignore.case = TRUE), - city_state_mapping$state[i], a_df$state) - } - - - - # brazil_delete<- c("barretos canc hosp","univ fed sao paulo", - # "escola filosofia letras & ciencias humanas", - # "hosp sirio libanes","perola byington hosp", - # "sp","univ fed sao paulo","nacl","pesquisa", "museu","dept", - # "lab","zoologia", "inst", "programa","ppg", "ppg") - - - # city_clean$city_list <- ifelse(city_clean$country_list=="brazil", - # gsub(paste(brazil_delete, collapse = "|"), NA, city_clean$city_list), - # city_clean$city_list) - - # AUSTRALIA --------------------------------------------------------------- - - a_df$state<- ifelse(a_df$country=="australia", - a_df$city,a_df$state) - a_df$postal_code<- ifelse(a_df$country=="australia", - a_df$city,a_df$postal_code) - a_df$city<- ifelse(a_df$country=="australia", - a_df$city2,a_df$city) - a_df$city2<- ifelse(a_df$country=="australia", - NA,a_df$city2) - - - - - - - a_df$postal_code<- ifelse(a_df$country=="australia", - gsub("[A-Za-z]", "", a_df$postal_code), - a_df$postal_code) - a_df$state<- ifelse(a_df$country=="australia", - gsub("[0-9]", "", a_df$state), - a_df$state) - - a_df[] <- lapply(a_df, trimws) - - -# canada ------------------------------------------------------------------ - - - - a_df$state<- ifelse(a_df$country=="canada" & nchar(a_df$city)==2, - a_df$city,a_df$state) - - a_df$city<- ifelse(a_df$country=="canada" & nchar(a_df$city)==2, - NA,a_df$city) - - a_df$postal_code<- ifelse(a_df$country=="canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2), - a_df$city2,a_df$postal_code) - a_df$postal_code<- ifelse(a_df$country=="canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2), - a_df$city2,a_df$postal_code) - - a_df$state<- ifelse(a_df$country=="canada" & a_df$city2==a_df$state, - NA,a_df$state) - - a_df$city<- ifelse(a_df$country=="canada",a_df$city2,a_df$city) - a_df$city2<- ifelse(a_df$country=="canada",NA,a_df$city2) - - - a_df$city<- ifelse(a_df$country=="canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city), - gsub("\\b(\\w{3})\\b \\b(\\w{3})\\b","", a_df$city), - a_df$city) - - a_df$state<- ifelse(a_df$country=="canada" & is.na(a_df$state), - a_df$postal_code, - a_df$state) - - a_df$state<- ifelse(a_df$country=="canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$state), - gsub("\\b(\\w{3})\\b \\b(\\w{3})\\b","", a_df$state), - a_df$state) - - a_df$postal_code <- ifelse(a_df$country=="canada" & grepl("\\b(\\w{2,20})\\b \\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$postal_code), - gsub("\\b(\\w{1,2}|\\w{4,})\\b", "", a_df$postal_code), - a_df$postal_code) - - a_df[] <- lapply(a_df, trimws) - - #TODO: a few postal codes still have letters from city - - - a_df$postal_code <- ifelse(a_df$country=="canada", gsub(" ","",a_df$postal_code),a_df$postal_code) - -# UK ---------------------------------------------------------------------- - - - - uk<- c("scotland", "england", "wales","northern ireland") - pattern <- "[a-z0-9]{2,4} [a-z0-9]{3,4}" - # - # a_df$postal_code <- ifelse(a_df$country %in% uk & - # grepl(pattern, a_df$city2),a_df$city2, - # a_df$postal_code) - - a_df$postal_code<- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city2), - a_df$city2,a_df$postal_code) - a_df$postal_code<- ifelse(a_df$country %in% uk & grepl(pattern, a_df$state), - a_df$state,a_df$postal_code) - a_df$postal_code<- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city), - a_df$city,a_df$postal_code) - - a_df$postal_code <- ifelse(a_df$country %in% uk, - ifelse(!grepl("\\d", a_df$postal_code), NA, a_df$postal_code), - a_df$postal_code) - - a_df$city<- ifelse(a_df$country %in% uk & a_df$city==a_df$postal_code, - NA,a_df$city) - - a_df$state<- ifelse(a_df$country %in% uk & a_df$state==a_df$postal_code, - NA,a_df$state) - - - a_df$state<- ifelse(a_df$country=="england",a_df$city,a_df$state) - a_df$city<- ifelse(a_df$country=="england",NA,a_df$city) - a_df$city<- ifelse(a_df$country=="england",a_df$postal_code,a_df$city) - a_df$city<- ifelse(a_df$country=="england", - gsub("\\b\\w*\\d\\w*\\b", "", a_df$city), - a_df$city) - - #TODO: england still needs work - - a_df$state<- ifelse(a_df$country=="scotland"|a_df$country=="northern ireland"|a_df$country=="wales" ,NA,a_df$state) - a_df$state<- ifelse(a_df$country=="scotland"|a_df$country=="northern ireland"|a_df$country=="wales" & is.na(a_df$state),a_df$city,a_df$state) - a_df$city<- ifelse(a_df$country=="scotland"|a_df$country=="northern ireland"|a_df$country=="wales",a_df$postal_code,a_df$city) - a_df$city<- ifelse(a_df$country=="scotland"|a_df$country=="northern ireland"|a_df$country=="wales" & is.na(a_df$city),a_df$city2,a_df$city) - a_df$city<- ifelse(a_df$country=="scotland"|a_df$country=="northern ireland"|a_df$country=="wales", - gsub("\\b\\w*\\d\\w*\\b", "", a_df$city), - a_df$city) - - -# postal codes clean uk --------------------------------------------------- - - - # Define the function - keep_numerical_parts <- function(df, control_col, country, target_col) { - # Apply the function to each row using sapply or a loop - df[[target_col]] <- sapply(1:nrow(df), function(i) { - if (df[[control_col]][i] == country) { - # Use gregexpr to find all parts of the string that include a numeral - matches <- gregexpr("\\b\\S*\\d\\S*\\b", df[[target_col]][i]) - # Extract the matched parts - result <- regmatches(df[[target_col]][i], matches) - # Combine the matched parts into a single string - result <- unlist(result) - result <- paste(result, collapse = " ") - result <- gsub(" ", "",result) - return(result) - } else { - return(df[[target_col]][i]) - } - }) - - return(df) - } - - - a_df <- keep_numerical_parts(a_df, "country","scotland", "postal_code") - a_df <- keep_numerical_parts(a_df, "country","england", "postal_code") - a_df <- keep_numerical_parts(a_df, "country","northern ireland", "postal_code") - a_df <- keep_numerical_parts(a_df, "country","wales", "postal_code") - - - - - - -# india ------------------------------------------------------------------ - - - a_df$postal_code<- ifelse(a_df$country=="india" & grepl("[0-9]{5,10}", a_df$city2), - a_df$city2,a_df$postal_code) - a_df$postal_code<- ifelse(a_df$country=="india" & grepl("[0-9]{5,10}", a_df$city), - a_df$city,a_df$postal_code) - - - a_df$city2<- ifelse(a_df$country=="india" & a_df$state==a_df$city2, - a_df$state,a_df$city2) - a_df$state<- ifelse(a_df$country=="india",NA,a_df$state) - a_df$state<- ifelse(a_df$country=="india" & is.na(a_df$postal_code), - a_df$city, a_df$state) - a_df$city<- ifelse(a_df$country=="india" & a_df$state==a_df$city, - NA, a_df$city) - a_df$city<- ifelse(a_df$country=="india" & grepl("[0-9]{4,10}", a_df$postal_code), - a_df$postal_code,a_df$city) - a_df$city<- ifelse(a_df$country=="india" & is.na(a_df$city), - a_df$city2, a_df$city) - - - a_df$postal_code<- ifelse(a_df$country=="india", - gsub("[A-Za-z]", "", a_df$postal_code), - a_df$postal_code) - a_df$city<- ifelse(a_df$country=="india", - gsub("[0-9]", "", a_df$city), - a_df$city) - - a_df$city <- ifelse(a_df$country=="india" & (grepl("delhi", a_df$city)|grepl("delhi", a_df$state)), - "new delhi", a_df$city) - -# china ------------------------------------------------------------------- - - - - a_df$postal_code<- ifelse(a_df$country=="china" & grepl("[0-9]{5,10}", a_df$city2), - a_df$city2,a_df$postal_code) - a_df$postal_code<- ifelse(a_df$country=="china" & grepl("[0-9]{5,10}", a_df$city), - a_df$city,a_df$postal_code) - a_df$postal_code<- ifelse(a_df$country=="china" & grepl("[0-9]{5,10}", a_df$state), - a_df$state,a_df$postal_code) - - - a_df$city2<- ifelse(a_df$country=="china" & a_df$state==a_df$city2, - a_df$state,a_df$city2) - a_df$state<- ifelse(a_df$country=="china",NA,a_df$state) - a_df$state<- ifelse(a_df$country=="china" & is.na(a_df$postal_code), - a_df$city, a_df$state) - a_df$city<- ifelse(a_df$country=="china" & a_df$state==a_df$city, - NA, a_df$city) - a_df$city<- ifelse(a_df$country=="china" & grepl("[0-9]{4,10}", a_df$postal_code), - a_df$postal_code,a_df$city) - a_df$city<- ifelse(a_df$country=="china" & is.na(a_df$city), - a_df$city2, a_df$city) - - - a_df$postal_code<- ifelse(a_df$country=="china", - gsub("[A-Za-z]", "", a_df$postal_code), - a_df$postal_code) - a_df$city<- ifelse(a_df$country=="china", - gsub("[0-9]", "", a_df$city), - a_df$city) - - - - a_df$city<- ifelse(a_df$country=="china" & grepl("beijing", a_df$state), - "beijing",a_df$city) - - - # Define words indicating this is actually a dept not state or postal code - # will use this list to delete the ones that don't apply - to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", - "dept", "div", "univ", "hosp", "coll", "sci", "rd", - "program","minist", "educ", "sch ", "grad ", "fac ", - "assoc","forest") - - - pattern <- paste(to_delete, collapse = "|") - # Apply the ifelse function to update - # a_df$city <- ifelse(a_df$country == "china" & - # grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), - # NA, a_df$city) - # - # - # a_df$city<- ifelse(a_df$country=="china" & is.na(a_df$city), - # a_df$state, a_df$city) - # - - - a_df[] <- lapply(a_df, trimws) - # Clean chn_pc1 and chn_pc2 - - - - # TODO: check this, allows verifying iof what is in state is actually the city - - # chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", - # "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", - # "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", - # "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", - # "gansu", "inner mongolia", "jilin", "hainan", "ningxia", - # "qinghai", "tibet", "macao") - # pattern <- paste(to_delete, collapse = "|") - # a_df$state<- ifelse(a_df$country=="china" & - # grepl(pattern, a_df$state, ignore.case = TRUE, perl = TRUE), - # NA, a_df$state) - # TODO: fix. not necessary but useful. - # All the cities in the addresses, add as needed. - # chn_cities <- unique(c((a_df$country=="china" & a_df$city), "lhasa")) - - - -# pc is letters dash numbers ---------------------------------------------- - - - pattern <- "\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" - - a_df$postal_code<- ifelse(grepl(pattern, a_df$city), - a_df$city,a_df$postal_code) - - a_df$postal_code<- ifelse(grepl(pattern, a_df$state), - a_df$state,a_df$postal_code) - - - # a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city==a_df$postal_code), - # a_df$postal_code,a_df$state) - - a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$postal_code), - a_df$city,a_df$state) - - - a_df$city<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$postal_code), - a_df$postal_code,a_df$city) - - a_df$city2<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$city), - NA,a_df$city2) - - # # - # # a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$state), - # # NA,a_df$state) - # # a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city==a_df$state), - # # NA,a_df$state) - # # - # a_df$city<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$city), - # a_df$city2,a_df$city) - - - a_df$city<- ifelse(grepl(pattern, a_df$city), - gsub("[0-9]","",a_df$city), - a_df$city) - a_df$city<- gsub("[a-z]{1,2}- ","", a_df$city) - - - # a_df$postal_code<- gsub("[a-z]","", a_df$postal_code) - - a_df$city<- gsub("[-]","", a_df$city) - a_df[] <- lapply(a_df, trimws) - - - pattern <- "\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" - - a_df$postal_code<- ifelse(grepl(pattern, a_df$postal_code), - gsub("[a-z]","",a_df$postal_code), - a_df$postal_code) - - a_df$postal_code<- gsub("[-]","", a_df$postal_code) - a_df[] <- lapply(a_df, trimws) - - - - -# final postal codes - consecutive numbers ------------------------------- - # Define the function - extract_consecutive_numbers <- function(df, source, destination) { - df[[destination]] <- sapply(1:nrow(df), function(i) { - # Use gregexpr to find sequences of 4 or more consecutive numbers - if (is.na(df[[destination]][i])) { - matches <- gregexpr("\\d{4,}", df[[source]][i]) - # Extract the matched sequences - result <- regmatches(df[[source]][i], matches) - # Flatten the list of matches into a character vector - result <- unlist(result) - # Combine the matched sequences into a single string - result <- paste(result, collapse = " ") - return(result) - } else { - return(df[[destination]][i]) - } - }) - return(df) - } - - a_df <- extract_consecutive_numbers(a_df, "state","postal_code") - - - -# clean the city ---------------------------------------------------------- - - - - a_df$city<- gsub("[0-9]","", a_df$city) - - - -# clean up postal code ---------------------------------------------------- - - - a_df$postal_code<- ifelse(grepl("\\b[a-zA-Z]+\\s+[0-9]+\\b", a_df$postal_code), - gsub("\\b[a-zA-Z]+\\s","", a_df$postal_code), - a_df$postal_code) - - - - # netherlands ------------------------------------------------------------- - # cities often have two characters at start (ascii version of ligature/dipthong) - a_df[] <- lapply(a_df, trimws) - a_df$city <- ifelse(a_df$country=="netherlands" & grepl("^[a-zA-Z]{2} ", a_df$city), - (sub("^[a-zA-Z]{2} ","", a_df$city)),a_df$city) - - - - - - - - - - - - - - - - - a_df[] <- lapply(a_df, trimws) - - # Remove panama canal zone from usa states (for stri) - a_df$state<-ifelse((a_df$country=="usa" & a_df$state=="cz"), - NA,a_df$state) - # armed forces & diplomatic - a_df$state<-ifelse((a_df$country=="usa" & a_df$state=="aa"), - NA,a_df$state) - - a_df$city<-ifelse((a_df$country=="usa" & a_df$state=="apo"), - NA,a_df$city) - - a_df$city<-ifelse((a_df$country=="usa" & a_df$state=="dpo"), - NA,a_df$city) - - a_df$city<-ifelse(a_df$city=="university pk", - "university park", - a_df$city) - - - a_df$city<-ifelse(a_df$city=="gavea rio de janeiro", - "rio de janeiro", - a_df$city) - - - a_df$city<-ifelse(a_df$city=="college stn", - "college station", - a_df$city) - - a_df$city<-ifelse(a_df$city=="n chicago", - "north chicago", - a_df$city) - - a_df$city<-ifelse(a_df$city=="college pk", - "college park", - a_df$city) - - a_df$city<-ifelse(a_df$city=="research triangle pk" | a_df$city=="res triangle pk", - "research triangle park", - a_df$city) - - a_df$city<-ifelse(a_df$city=="state coll", - "state college", - a_df$city) - - # city corrections - a_df$city<-ifelse((a_df$city=="dehra dun" & a_df$country == "india"), - "dehradun", - a_df$city) - - a_df$city<-ifelse((a_df$city=="st john" & a_df$country == "canada"), - "st. john's", - a_df$city) - - a_df$state<-ifelse(a_df$state=="london ", - "london", - a_df$state) - - a_df$city<-ifelse((a_df$state=="london" & a_df$country == "england"), - "london", - a_df$city) - - a_df$city<-ifelse((a_df$country=="brazil" & a_df$city == "s jose campos"), - "sao jose dos campos", - a_df$city) - - a_df$city<-ifelse((a_df$country=="brazil" & (a_df$city == "rio de janerio"| - a_df$city == "rio de janiero"| - a_df$city == "rio der janeiro"| - a_df$city == "rio janeiro"| - a_df$city == "rio janiero")), - "rio de janeiro", - a_df$city) - # - # a_df$state<-ifelse(a_df$state=="london", - # NA, - # a_df$state) - - - a_df$city<-ifelse(a_df$country=="mexico" & a_df$city == "df", - "mexico city", - a_df$city) - - a_df$city<-ifelse(a_df$country=="argentina" & a_df$city == "df", - "buenos aires",a_df$city) - - - a_df$city <- ifelse(grepl("^st ", a_df$city), - (sub("^st ","saint ", a_df$city)),a_df$city) - - a_df$city <- ifelse(grepl(" st ", a_df$city), - (sub(" st "," saint ", a_df$city)),a_df$city) - - a_df$city <- ifelse(grepl("^ste ", a_df$city), - (sub("^ste ","saint ", a_df$city)),a_df$city) - - - a_df$city <- ifelse(grepl("sioux ctr", a_df$city), - (sub("sioux ctr","sioux city", a_df$city)),a_df$city) - - - tech_words <- c(" lab ", "lab "," lab", "dept", "hosp", " inst","inst ", "ctr", - "unit", "ltd", "minist", "educ", "grad ", " sch ","sch "," sch", - "coll ", " sci ", "natl", "&", " med","med ", - "publ", "dept", "biomed", "phys", "technol", - "engn") - pattern <- paste(tech_words, collapse = "|") - - a_df$city<- ifelse((a_df$city!="esch sur alzette" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), - a_df$state, a_df$city) - - - a_df$state<- ifelse(a_df$state==a_df$city2,NA,a_df$state) - - - a_df$state<- ifelse(grepl("[[:digit:]]",a_df$state), - NA, a_df$state) - - - - a_df$state<-ifelse(a_df$state=="",NA,a_df$state) - a_df$postal_code<-ifelse(a_df$postal_code=="",NA,a_df$postal_code) - - us_state_abbreviations_lower <- c("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", - "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", - "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", - "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", - "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy") - pattern <- paste(us_state_abbreviations_lower, collapse = "|") - a_df$country_list<-country_list - a_df$state<- ifelse((a_df$country=="usa" & is.na(a_df$state) & grepl(pattern, a_df$address, ignore.case = TRUE, perl = TRUE)), - a_df$country_list,a_df$state) - - - - a_df$state<- ifelse((a_df$country=="usa" & grepl("[[:digit:]]",a_df$state)), - gsub("[[:digit:]]","", a_df$state),a_df$state) - a_df$state<- ifelse((a_df$country=="usa" & grepl("usa",a_df$state)), - gsub("usa","", a_df$state),a_df$state) - a_df$state<- trimws(a_df$state, which = "both") - - - # fixing japanese prefectures and cities, which are sometimes swapped. - - - to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", - "dept", "div", "univ", "hosp", "coll", "sci", "rd", - "program","minist", "educ", "sch ", "grad ", "fac ", - "assoc","forest", "corp") - pattern <- paste(to_delete, collapse = "|") - a_df$city2<- ifelse((a_df$country=="japan" & grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), - NA,a_df$city2) - - # Remove any with numbers - a_df$city2<- ifelse((a_df$country=="japan" & grepl("[[:digit:]]", a_df$city2)), - NA,a_df$city2) - - - - japan_prefectures <- c("hokkaido","aomori", "iwate","miyagi","akita", - "yamagata","fukushima","ibaraki","tochigi","gunma", - "saitama","chiba","tokyo","kanagawa","niigata", - "toyama","ishikawa","fukui","yamanashi","nagano","gifu", - "shizuoka","aichi","mie","shiga","kyoto","osaka","gumma", - "hyogo","nara","wakayama","tottori","shimane", - "okayama","hiroshima","yamaguchi","tokushima","kagawa", - "ehime","kochi","fukuoka","saga","nagasaki","kumamoto", - "oita","miyazaki","kagoshima","okinawa") - pattern <- paste(japan_prefectures, collapse = "|") - - - - a_df$state<- ifelse((a_df$country=="japan" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), - a_df$city,a_df$state) - - - # This removes all special regions of a city like tokyo from city2 - a_df$city2<- ifelse((a_df$country=="japan" & grepl(" ku", a_df$city2, ignore.case = TRUE, perl = TRUE)), - NA,a_df$city2) - - # replace from city with city2 EXCEPT in cases where no state (and therefore - # city is correct) and where no city2 (otherwise would bring in NA) - - a_df$city<- ifelse((a_df$country=="japan" & !(is.na(a_df$state)) & !(is.na(a_df$city2))), - a_df$city2,a_df$city) - - - - # Fix Scotland - - a_df$city<- ifelse((a_df$country=="scotland" & grepl("univ ", a_df$city, ignore.case = TRUE, perl = TRUE)), - gsub("univ ","",a_df$city),a_df$city) - - - - to_delete <- c(" ave", " grp", "hlth", " rd", "mrc"," oba","plz", - " dr", "oqb", " quad","fisheries") - - pattern <- paste(to_delete, collapse = "|") - a_df$city<- ifelse((a_df$country=="scotland" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), - NA,a_df$city) - - - # Fix UK - to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", - "dept", "div", "univ", "hosp", "coll", "sci", "rd", - "program","minist", "educ", "sch ", "grad ", "fac ", - " sq", "quarter", " way", " dr", "diagnost", "consultant", - "microsoft","diagnost", "[[:digit:]]","project","facil", "grp", - "campus","expt"," pk", "canc","assoc","forest", "corp", - "consortium", "partners", "lane","ucl","street","trust", - "business", "inform", "royal","survey","drosophila", " st", - "ndorms", "nat hist", "hlth", " ave","council", "unit", "nerc", "nat res") - pattern <- paste(to_delete, collapse = "|") - a_df$city2<- ifelse((a_df$country=="england" & grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), - NA,a_df$city2) - - a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)), - a_df$city2,a_df$city) - - - a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)) & - grepl("london", a_df$address, ignore.case = TRUE, perl = TRUE), - "london",a_df$city) - - a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)) & - grepl("cambridge", a_df$address, ignore.case = TRUE, perl = TRUE), - "cambridge",a_df$city) - - a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)) & - grepl("oxford", a_df$address, ignore.case = TRUE, perl = TRUE), - "oxford",a_df$city) - - a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)) & - grepl("durham", a_df$address, ignore.case = TRUE, perl = TRUE), - "durham",a_df$city) - - a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)) & - grepl("bristol", a_df$address, ignore.case = TRUE, perl = TRUE), - "bristol",a_df$city) - - a_df$city<- ifelse((a_df$country=="england" & is.na(a_df$city)) & - grepl("lancaster", a_df$address, ignore.case = TRUE, perl = TRUE), - "lancaster",a_df$city) - - - - a_df$city2 <- NULL - a_df$country_list<-NULL - - -} - - - - - - - - - - - # country<- a_df %>% - # # mutate(city_match=(city==second_tier)) %>% - # # filter(city_match==FALSE) %>% - # distinct(country) %>% - # mutate(summary=nchar(country)) %>% - # arrange(country) - # - # - # country_city<- a_df %>% - # # mutate(city_match=(city==second_tier)) %>% - # # filter(city_match==FALSE) %>% - # distinct(country,city) %>% - # mutate(city_char=nchar(city)) %>% - # arrange(country,city) - # - # - # - # country_state<- a_df %>% - # # mutate(city_match=(city==second_tier)) %>% - # # filter(city_match==FALSE) %>% - # distinct(country,state) %>% - # mutate(city_char=nchar(state)) %>% - # arrange(country,state) - # - # country_state_city<- a_df %>% - # # mutate(city_match=(city==second_tier)) %>% - # # filter(city_match==FALSE) %>% - # distinct(country ,state,city) %>% - # mutate(city_char=nchar(city)) %>% - # arrange(country,state,city) - # - # - # country_state_city_pc<- a_df %>% - # # mutate(city_match=(city==second_tier)) %>% - # # filter(city_match==FALSE) %>% - # distinct(country ,state,postal_code,city) %>% - # mutate(city_char=nchar(city)) %>% - # arrange(country,state,postal_code,city) - - - - - -# old code ---------------------------------------------------------------- - -# -# -# -# -# last_element %in% c("india", "china", -# -# -# a_df$state<- ifelse(a_df$state==a_df$city2,NA,a_df$state) -# -# -# -# ## Australia postal codes also separated -# # sometimes in the next-to-last, sometimes in the next-to-next-to-last (with state) -# -# # First need to "fix" the city -# # Function to check for three characters or numbers in the city-list and replace with NA -# process_aus_address <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) # Not Australia -# } -# -# if (x[[length(x)]] == "australia") { -# if (length(x) == 3) { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) -# } else { -# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) -# } -# } -# -# return(c(NA, NA)) # Not Australia -# } -# -# # Apply the function across all addresses using `mapply` -# results <- t(mapply(process_aus_address, list_address)) -# colnames(results) <- c("aus_city", "aus_state") -# -# # Clean up results -# results <- as.data.frame(results, stringsAsFactors = FALSE) -# results$aus_city[results$aus_city == "not australia"] <- NA -# results$aus_state[results$aus_state == "not australia"] <- NA -# -# # take the PC+state and assign to PC -# results$aus_pc<-results$aus_state -# # remove all digits from state -# results$aus_state <- trimws(gsub("\\d", "", results$aus_state)) -# # remove all letters from pc -# results$aus_pc <- trimws(gsub("[a-z]", "", results$aus_pc)) -# results$aus_pc[results$aus_pc == ""] <- NA -# -# # if na in PC, assign the city (some of which have PC) -# results$aus_pc <- ifelse(is.na(results$aus_pc), results$aus_city, results$aus_pc) -# # remove all metters from pc, leaving any new pc -# results$aus_pc <- trimws(gsub("[a-z]", "", results$aus_pc)) -# results$aus_pc[results$aus_pc == ""] <- NA -# # now remove any PC from city -# results$aus_city <- trimws(gsub("\\d", "", results$aus_city)) #alternative -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$aus_city, city_list) -# -# # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$aus_state, state_list) -# -# # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$aus_pc, pc_list) -# -# rm(results) -# -# -# -# # CANADA ------------------------------------------------------------------ -# -# ## Canada postal codes also separated -# # sometimes in the next-to-last, sometimes in the next-to-next-to-last (with state) -# -# process_can_address <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) # Not Canada -# } -# -# if (x[[length(x)]] == "canada") { -# if (length(x) == 3) { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) -# } else { -# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) -# } -# } -# -# return(c(NA, NA)) # Not canada -# } -# -# # Apply the function across all addresses using `mapply` -# results <- t(mapply(process_can_address, list_address)) -# colnames(results) <- c("can_city", "can_state") -# -# # Clean up results -# results <- as.data.frame(results, stringsAsFactors = FALSE) -# results$can_city[results$can_city == "not canada"] <- NA -# results$can_state[results$can_state == "not canada"] <- NA -# -# # take the PC+state and assign to PC -# results$can_pc<-results$can_state -# -# # any without numbers gets NA'd -# results$can_pc[!grepl("\\d", results$can_pc)] <- NA -# # removes state and removes ltr at start of PC -# results$can_pc<-sub("[a-z]\\s+[a-z]", "", results$can_pc) #+[a-z] -# -# -# # if na in PC, assign the city (some of which have PC) -# results$can_pc <- ifelse(is.na(results$can_pc), results$can_city, results$can_pc) -# results$can_pc[!grepl("\\d", results$can_pc)] <- NA -# -# # keep portions with numbers / remove city names -# # .*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$: This regex pattern matches any -# # characters (.*) followed by a word boundary (\\b) and exactly three word -# # characters (\\w{3}), capturing this as the first group ((\\w{3})). It -# # then matches any characters again (.*) followed by another word boundary -# # and exactly three word characters, capturing this as the second -# # group ((\\w{3})), and ensures this is at the end of the string ($). -# # sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", string): Replaces the -# # entire string with the two captured groups separated by a space. -# results$can_pc<-sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", results$can_pc) -# -# # results$can_pc<-gsub("\\b[^\\d\\s]+\\b", "", results$can_pc) #+[a-z] removes 2nd ltr -# -# results$can_pc[results$can_pc == ""] <- NA -# # now remove any PC from city -# results$can_city<-sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", results$can_city) # all after first space -# -# -# # fix state -# results$can_state <- trimws(gsub("\\d", "", results$can_state)) -# results$can_state <-trimws(sub(" .*", "", results$can_state)) # all after first space -# results$can_state <- gsub("british", "bc", results$can_state) -# results$can_state <- gsub("nova", "ns", results$can_state) -# # fix city -# results$can_city <- trimws(gsub("\\d", "", results$can_city)) -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$can_city, city_list) -# -# # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$can_state, state_list) -# -# # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$can_pc, pc_list) -# -# rm(results) -# -# # INDIA ------------------------------------------------------------------ -# -# # India states are almost always listed but New Delhi is complicated, -# # as are any with only three entries -# # Function to process addresses for both city and state -# process_india_address <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) # Not India -# } -# -# if (x[[length(x)]] == "india") { -# if (length(x) == 3) { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) -# } else { -# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) -# } -# } -# -# if (length(x) != 3 && grepl("delhi", x[[length(x) - 1]])) { -# return(c(trimws(x[[length(x) - 1]]), NA)) -# } -# -# return(c(NA, NA)) # Not India -# } -# -# # Apply the function across all addresses using `mapply` -# results <- t(mapply(process_india_address, list_address)) -# colnames(results) <- c("india_city", "india_state") -# -# # Clean up results -# results <- as.data.frame(results, stringsAsFactors = FALSE) -# results$india_city[results$india_city == "not india"] <- NA -# results$india_state[results$india_state == "not india"] <- NA -# -# # Remove numeric parts from state names and trim whitespace -# results$india_state <- trimws(gsub("\\s([0-9]+)", "", results$india_state)) -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$india_city, city_list) -# #### -# -# state_list<-ifelse(is.na(state_list),results$india_state, state_list) -# -# rm(results) -# -# -# # BRAZIL ------------------------------------------------------------------ -# -# -# # Function to process addresses for both city and state -# process_brazil_address <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) # Not brl -# } -# if (x[[length(x)]] == "brazil") { -# if (length(x) == 3) { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) -# } else { -# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) -# } -# } -# -# -# return(c(NA, NA)) # Not brl -# } -# -# -# -# # Apply the function across all addresses -# results <- as.data.frame(t(mapply(process_brazil_address, list_address)), -# stringsAsFactors = FALSE) -# colnames(results) <- c("brl_city", "brl_state") -# -# -# -# -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# to_check <- c("dept","ctr","inst","ppg","andar","empresas", -# "programa", "ciencias", "unidade", "lab ") -# -# results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), -# results$brl_state, -# results$brl_city) -# -# -# -# results$brl_state <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_state), -# results$brl_city, -# results$brl_state) -# -# -# # Define the function -# extract_brl_postcodes <- function(df, source_col, target_col,brl_new_city) { -# # 6 digits -# -# pattern <- "br-[0-9]{5,8}" -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# df[i, brl_new_city] <- df[i, source_col] -# # df[i, city_col] <- df[i, source_col] -# -# } -# } -# return(df) -# } -# -# results$brl_pc<-NA -# results$brl_new_city<-NA -# # df, source_col, target_col,city_col -# results <- extract_brl_postcodes(results, "brl_city","brl_pc", "brl_new_city") -# results <- extract_brl_postcodes(results, "brl_state","brl_pc", "brl_new_city") -# -# -# results$brl_new_city <- ifelse(is.na(results$brl_new_city), -# results$brl_state, -# results$brl_new_city) -# -# -# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, -# results$brl_city, -# results$brl_state) -# -# -# -# -# results$brl_new_city <- ifelse(is.na(results$brl_new_city), -# results$brl_city, -# results$brl_new_city) -# -# -# -# -# results$brl_city<-gsub("br-","",results$brl_city) -# results$brl_city<-sub("[0-9]{2,8}","",results$brl_city) -# results$brl_city<-sub("-","",results$brl_city) -# -# -# results$brl_new_city<-gsub("br-","",results$brl_new_city) -# results$brl_new_city<-sub("[0-9]{2,8}","",results$brl_new_city) -# results$brl_new_city<-sub("-","",results$brl_new_city) -# -# -# results$brl_pc<-gsub("br-","",results$brl_pc) -# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) -# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) -# -# results$brl_state<-gsub("br-","",results$brl_state) -# results$brl_state<-sub("[0-9]{2,8} ","",results$brl_state) -# results$brl_state<-sub(" [0-9]{2,8} ","",results$brl_state) -# -# # -# # -# # -# # -# # -# # results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) -# # results$brl_new_city<-gsub("gavea rio de janeiro","rio de janeiro",results$brl_new_city) -# # results$brl_new_city<-gsub("gavea","rio de janeiro",results$brl_new_city) -# # # -# # results$brl_city<-gsub("-","",results$brl_city) -# # -# # results$brl_state<-gsub("br-","",results$brl_state) -# # results$brl_state<-gsub("-","",results$brl_state) -# -# -# -# -# -# # any without numbers gets NA'd -# results$brl_pc[!grepl("\\d", results$brl_pc)] <- NA -# -# # keep portions with numbers / remove city names -# -# results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) -# results$brl_city<-sub(" [0-9]{2,8}","",results$brl_city) -# # -# # # Specific replacements -# # results$brl_city <- ifelse(grepl("rio de janeiro", results$brl_city), -# # "rio de janeiro", results$brl_city) -# -# results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), -# "rio de janeiro", results$brl_city) -# results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), -# "rj", results$brl_state) -# results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), -# "sp", results$brl_state) -# -# -# -# results$brl_city[results$brl_city==results$brl_state]<-NA -# -# -# # Clean up and adjust columns -# results[] <- lapply(results, trimws) -# -# -# # Define city-to-state mapping -# city_state_mapping <- data.frame( -# city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), -# state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), -# stringsAsFactors = FALSE -# ) -# -# # Match cities and states -# for (i in 1:nrow(city_state_mapping)) { -# results$brl_city <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), -# city_state_mapping$city[i], results$brl_city) -# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), -# city_state_mapping$state[i], results$brl_state) -# } -# -# -# # Match cities and states -# for (i in 1:nrow(city_state_mapping)) { -# -# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), -# city_state_mapping$state[i], results$brl_state) -# } -# -# -# -# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, -# results$brl_city, -# results$brl_state) -# -# -# results$brl_city <- trimws(results$brl_city, which = "both") -# results$brl_state <- trimws(results$brl_state, which = "both") -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# to_check <- c("dept","ctr","inst","ppg", -# "programa", "ciencias", "unidade", "lab ") -# -# results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), -# results$brl_state, -# results$brl_city) -# -# -# # Final trimming -# results[] <- lapply(results, trimws) -# -# results$brl_city <- ifelse(results$brl_new_city==results$brl_city, -# NA, -# results$brl_city) -# -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$brl_city, city_list) -# # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$brl_state, state_list) -# # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$brl_pc, pc_list) -# -# -# rm(results,city_state_mapping) -# -# -# -# # Handle postal codes (BR-[0-9]) -# results$brazil_pc <- ifelse(grepl("br-[0-9]", results$brazil_state), results$brazil_state, NA) -# results$brazil_pc <- ifelse(grepl("br-[0-9]", results$brazil_city) & is.na(results$brazil_pc), -# results$brazil_city, results$brazil_pc) -# -# # Remove BR codes from city and state -# results$brazil_city <- gsub("br-[0-9]+", "", results$brazil_city) -# results$brazil_state <- gsub("br-[0-9]+", "", results$brazil_state) -# -# # Define city-to-state mapping -# city_state_mapping <- data.frame( -# city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "sao paulo"), -# state = c("sp", "sp", "sp", "sp", "rj", "rj", "sp"), -# stringsAsFactors = FALSE -# ) -# -# # Match cities and states -# for (i in 1:nrow(city_state_mapping)) { -# results$brazil_city <- ifelse(grepl(city_state_mapping$city[i], results$brazil_state, ignore.case = TRUE), -# city_state_mapping$city[i], results$brazil_city) -# results$brazil_state <- ifelse(grepl(city_state_mapping$city[i], results$brazil_state, ignore.case = TRUE), -# city_state_mapping$state[i], results$brazil_state) -# } -# -# # Specific replacements -# results$brazil_city <- ifelse(grepl("museu nacl", results$brazil_city), -# "rio de janeiro", results$brazil_city) -# results$brazil_state <- ifelse(grepl("rio de janeiro", results$brazil_state, ignore.case = TRUE), -# "rj", results$brazil_state) -# results$brazil_state <- ifelse(grepl("sao paulo", results$brazil_state, ignore.case = TRUE), -# "sp", results$brazil_state) -# -# # cleanup -# results$brazil_city[results$brazil_city==results$brazil_state]<-NA -# results$brazil_city <- trimws(results$brazil_city, which = "both") -# results$brazil_state <- trimws(results$brazil_state, which = "both") -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# to_check <- c("dept","ctr","inst","ppg", -# "programa", "ciencias", "unidade", "lab ") -# -# results$brazil_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brazil_city), -# results$brazil_state, -# results$brazil_city) -# -# # Clean postal codes -# results$brazil_pc <- gsub("[A-Za-z-]", "", results$brazil_pc) -# -# # Final trimming -# results[] <- lapply(results, trimws) -# -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$brazil_city, city_list) -# # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$brazil_state, state_list) -# # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$brazil_pc, pc_list) -# -# -# rm(results,city_state_mapping) -# -# -# # CHINA ------------------------------------------------------------------- -# chn_extract <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) -# } else if (x[[length(x)]] == "china") { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 2]]))) -# } else { -# return(c(NA, NA)) -# } -# } -# -# chn_pc <- data.frame(do.call(rbind, lapply(list_address, chn_extract))) -# names(chn_pc) <- c("chn_city", "chn_state") -# -# # Define the function -# extract_china_postcodes <- function(df, source_col, target_col,chn_new_city) { -# # 6 digits -# pattern <- "[0-9]{6}" -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# df[i, chn_new_city] <- df[i, source_col] -# # df[i, city_col] <- df[i, source_col] -# -# } -# } -# return(df) -# } -# -# chn_pc$chn_pc<-NA -# chn_pc$chn_new_city<-NA -# # df, source_col, target_col,city_col -# chn_pc <- extract_china_postcodes(chn_pc, "chn_city","chn_pc", "chn_new_city") -# chn_pc <- extract_china_postcodes(chn_pc, "chn_state","chn_pc", "chn_new_city") -# -# -# -# # any without numbers gets NA'd -# chn_pc$chn_pc[!grepl("\\d", chn_pc$chn_pc)] <- NA -# -# # keep portions with numbers / remove city names -# chn_pc$chn_new_city<-sub("[0-9]{6}","",chn_pc$chn_new_city) -# chn_pc$chn_city<-sub("[0-9]{6}","",chn_pc$chn_city) -# chn_pc$chn_state<-sub("[0-9]{6}","",chn_pc$chn_state) -# -# -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# -# to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", -# "dept", "div", "univ", "hosp", "coll", "sci", "rd", -# "program","minist", "educ", "sch ", "grad ", "fac ", -# "assoc","forest") -# -# # Define the function -# -# -# # Print the resulting dataframe -# print(a_df) -# -# -# chn_pc[, c("chn_city","chn_state", "chn_pc", "chn_new_city")]<-lapply(chn_pc[, c("chn_city","chn_state", "chn_pc", "chn_new_city")], trimws) -# -# -# clean_column <- function(column, delete_terms) { -# column <- gsub(paste(delete_terms, collapse = "|"), NA, column) -# column <- gsub("[0-9]", "", column) # Remove digits -# trimws(column) # Remove leading/trailing whitespace -# } -# -# -# # Clean chn_pc1 and chn_pc2 -# chn_pc$chn_city <- clean_column(chn_pc$chn_city, to_delete) -# chn_pc$chn_new_city <- clean_column(chn_pc$chn_new_city, to_delete) -# chn_pc$chn_state <- clean_column(chn_pc$chn_state, to_delete) -# -# -# chn_pc$chn_new_city <- ifelse(is.na(chn_pc$chn_new_city), -# chn_pc$chn_state, -# chn_pc$chn_new_city) -# -# -# chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_state)==TRUE), -# NA,chn_pc$chn_state) -# -# -# -# chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_city)==FALSE), -# chn_pc$chn_city,chn_pc$chn_state) -# -# -# -# chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_state)==TRUE), -# NA,chn_pc$chn_state) -# -# chn_pc$chn_state <- gsub(" province", "",chn_pc$chn_state) -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), chn_pc$chn_new_city, city_list) -# -# # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),chn_pc$chn_state, state_list) -# -# # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),chn_pc$chn_pc, pc_list) -# -# rm(chn_pc) -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # ### China has some where the postal code is with the city, so fix those here -# # # Extract postal code information from the list -# # chn_extract <- function(x) { -# # if (length(x) == 1) { -# # return(c(NA, NA)) -# # } else if (x[[length(x)]] == "china") { -# # return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 2]]))) -# # } else { -# # return(c(NA, NA)) -# # } -# # } -# -# # Apply extraction to list_address -# # chn_missing_pc <- data.frame(do.call(rbind, lapply(list_address, chn_extract))) -# # names(chn_missing_pc) <- c("chn_pc1", "chn_pc2") -# # -# # # Define words indicating this is actually a dept not state or postal code -# # # will use this list to delete the ones that don't apply -# # to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", -# # "dept", "div", "univ", "hosp", "coll", "sci", "rd","program" -# # "minist", "educ", "sch ", "grad ", "fac ","assoc") -# # -# # -# # -# # # Extract numeric postal codes -# # chn_missing_pc$chn_pc_from1 <- as.numeric(gsub("[A-Za-z]", "", chn_missing_pc$chn_pc1)) -# # chn_missing_pc$chn_pc_from2 <- as.numeric(gsub("[A-Za-z]", "", chn_missing_pc$chn_pc2)) -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # clean_column <- function(column, delete_terms) { -# # column <- gsub(paste(delete_terms, collapse = "|"), NA, column) -# # column <- gsub("[0-9]", "", column) # Remove digits -# # trimws(column) # Remove leading/trailing whitespace -# # } -# # -# # # Clean chn_pc1 and chn_pc2 -# # chn_missing_pc$chn_pc1 <- clean_column(chn_missing_pc$chn_pc1, to_delete) -# # chn_missing_pc$chn_pc2 <- clean_column(chn_missing_pc$chn_pc2, to_delete) -# # -# # # Initialize empty columns for final outputs -# # chn_missing_pc$chn_pc <- NA -# # chn_missing_pc$chn_city <- NA -# # chn_missing_pc$chn_state <- NA -# # -# # # Assign postal codes, cities, and states based on conditions -# # assign_chn_data <- function(from1, from2, pc1, pc2) { -# # list( -# # chn_pc = ifelse(is.na(from1) & !is.na(from2), from2, from1), -# # chn_city = ifelse(is.na(from1) & !is.na(from2), pc2, pc1), -# # chn_state = ifelse(is.na(from1) & !is.na(from2), pc1, pc2) -# # ) -# # } -# # -# # chn_result <- assign_chn_data(chn_missing_pc$chn_pc_from1, -# # chn_missing_pc$chn_pc_from2, -# # chn_missing_pc$chn_pc1, -# # chn_missing_pc$chn_pc2) -# # -# # chn_missing_pc$chn_pc <- chn_result$chn_pc -# # chn_missing_pc$chn_city <- gsub("[0-9]", "", chn_result$chn_city) -# # chn_missing_pc$chn_state <- gsub("[0-9]", "", chn_result$chn_state) -# -# # # Define Chinese states and cities -# # chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", -# # "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", -# # "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", -# # "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", -# # "gansu", "inner mongolia", "jilin", "hainan", "ningxia", -# # "qinghai", "tibet", "macao") -# # -# # # All the cities in the addresses, add as needed. -# # chn_cities <- unique(c(chn_missing_pc$chn_city, "lhasa")) -# -# # Update states and cities based on matching conditions -# # chn_missing_pc$chn_state <- ifelse(is.na(chn_missing_pc$chn_state) & chn_missing_pc$chn_pc1 %in% chn_states, -# # chn_missing_pc$chn_pc1, chn_missing_pc$chn_state) -# # chn_missing_pc$chn_state <- ifelse(is.na(chn_missing_pc$chn_state) & chn_missing_pc$chn_pc2 %in% chn_states, -# # chn_missing_pc$chn_pc2, chn_missing_pc$chn_state) -# # -# # chn_missing_pc$chn_city <- ifelse(is.na(chn_missing_pc$chn_city) & !(chn_missing_pc$chn_pc1 %in% chn_states), -# # chn_missing_pc$chn_pc1, chn_missing_pc$chn_city) -# # chn_missing_pc$chn_city <- ifelse(is.na(chn_missing_pc$chn_city) & !(chn_missing_pc$chn_pc2 %in% chn_states), -# # chn_missing_pc$chn_pc2, chn_missing_pc$chn_city) -# # -# # # put the postal codes and cities in the pc_list, state_list -# # pc_list<-ifelse(is.na(pc_list),chn_missing_pc$chn_pc, pc_list) -# # city_list<-ifelse(is.na(city_list),chn_missing_pc$chn_city, city_list) -# # state_list<-ifelse(is.na(state_list),chn_missing_pc$chn_state, state_list) -# # -# # -# # rm(chn_cities,chn_states,to_delete,chn_result,chn_missing_pc) -# -# -# -# -# -# -# -# -# -# # UK ------------------------------------------------------------------ -# -# process_uk_address <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) # Not uk -# } -# -# if ((x[[length(x)]] == "england")| -# (x[[length(x)]] == "scotland")| -# (x[[length(x)]] == "wales")| -# (x[[length(x)]] == "northern ireland")) { -# if (length(x) == 3) { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) -# } else { -# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) -# } -# } -# -# return(c(NA, NA)) # Not uk -# } -# -# # Apply the function across all addresses using `mapply` -# results <- t(mapply(process_uk_address, list_address)) -# colnames(results) <- c("uk_city", "uk_state") -# -# -# -# -# # Clean up results -# results <- as.data.frame(results, stringsAsFactors = FALSE) -# results$uk_city[results$uk_city == "not uk"] <- NA -# results$uk_state[results$uk_state == "not uk"] <- NA -# -# -# -# results$uk_pc<-NA -# -# # Define the function -# extract_uk_postcodes <- function(df, source_col, target_col,city_col) { -# # Regular expression pattern for UK postal codes -# # One or two initial letters. -# # One or two digits (and possibly a letter). -# # A mandatory space. -# # One digit followed by two letters. -# pattern <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" -# pattern2 <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [A-Za-z]{2}" -# pattern3 <- "[A-Za-z]{1,2}[0-9R]{3} [A-Za-z]{3}" -# -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# df[i, city_col] <- df[i, source_col] -# -# } -# } -# return(df) -# } -# -# # Example usage -# -# results <- extract_uk_postcodes(results, "uk_city","uk_pc", "uk_city") -# results <- extract_uk_postcodes(results, "uk_state","uk_pc", "uk_city") -# -# # any without numbers gets NA'd -# results$uk_pc[!grepl("\\d", results$uk_pc)] <- NA -# -# -# -# results$new_city<-NA -# -# -# # Define the function -# uk_city_id <- function(df, source_col, target_col) { -# # Regular expression pattern for UK postal codes -# # One or two initial letters. -# # One or two digits (and possibly a letter). -# # A mandatory space. -# # One digit followed by two letters. -# pattern <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- df[i, source_col] -# } -# } -# return(df) -# } -# -# # Example usage -# -# results <- uk_city_id(results, "uk_city","new_city") -# results <- uk_city_id(results, "uk_state","new_city") -# -# -# results$new_city<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$new_city) -# -# -# -# -# # Define the function -# uk_city_id <- function(df, source_col, target_col) { -# # Regular expression pattern for UK postal codes -# # One or two initial letters. -# # One or two digits (and possibly a letter). -# # A mandatory space. -# # One digit followed by two letters. -# pattern <- "\\s[A-Za-z0-9]{2,4}\\s[A-Za-z0-9]{2,4}" -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- df[i, source_col] -# } -# } -# return(df) -# } -# -# # Example usage -# results <- uk_city_id(results, "uk_state","new_city") -# -# -# -# results$uk_state<-ifelse(results$uk_state==results$uk_city, -# "",results$uk_state) -# -# results$new_city<-ifelse(is.na(results$new_city), -# results$uk_city, -# results$new_city) -# # remove zip codes from new city -# results$new_city<-gsub("\\s[A-Za-z0-9]{3}\\s[A-Za-z0-9]{3}","",results$new_city) -# results$new_city<-gsub("\\s[A-Za-z0-9]{4}\\s[A-Za-z0-9]{2,3}","",results$new_city) -# -# -# results$new_city<-ifelse(results$uk_state=="london", -# "london", -# results$new_city) -# -# -# results$uk_state<-ifelse(results$uk_state=="london", -# NA, -# results$uk_state) -# -# -# -# -# -# -# -# -# -# -# # keep portions with numbers / remove city names -# # results$uk_city<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$uk_city) -# # results$uk_state<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$uk_state) -# # results$uk_pc<-sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", results$uk_pc) -# -# # results$can_pc<-gsub("\\b[^\\d\\s]+\\b", "", results$can_pc) #+[a-z] removes 2nd ltr -# -# results$uk_pc[results$uk_pc == ""] <- NA -# # now remove any PC from city -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$new_city, city_list) -# -# # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$uk_state, state_list) -# -# # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$uk_pc, pc_list) -# -# rm(results) -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# # Extracts postal code when combined with city name ----------------------- -# -# city_list <- trimws(city_list, which = "both") -# -# -# -# city_clean<-data.frame( -# authorID=ID, -# addresses=addresses, -# original_city=city_list, -# city_list=city_list, -# state_list=state_list, -# country_list=country_list, -# extract_pc=pc_list) -# -# -# # # England, Scotland, Wales --------------------------------------------- -# # -# # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc)& (city_clean$country_list=="england"| -# # city_clean$country_list=="wales" | -# # city_clean$country_list=="scotland"), -# # gsub(".*\\s([a-z0-9]+\\s[a-z0-9]+)$", "\\1", city_clean$city_list), -# # city_clean$extract_pc) -# # -# # # This then deletes the postal code from the city name -# # city_clean$city_list<-ifelse((city_clean$country_list=="england"| -# # city_clean$country_list=="wales" | -# # city_clean$country_list=="scotland"), -# # (gsub("([A-Za-z0-9]+\\s[A-Za-z0-9]+)$", "", city_clean$city_list)), -# # city_clean$city_list) -# # -# city_clean[city_clean == ""] <- NA -# -# # Define the function -# extract_postcodes <- function(df, source_col, target_col) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- "\\b[A-Za-z]{1,2}[-][0-9]{4,8}\\b" -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# } -# } -# return(df) -# } -# -# # Example usage -# -# city_clean <- extract_postcodes(city_clean, "city_list","extract_pc") -# -# -# -# # Define the function -# extract_postcodes <- function(df, source_col, target_col) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- " [0-9]{3,9}" -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# } -# } -# return(df) -# } -# -# -# # Example usage -# -# city_clean <- extract_postcodes(city_clean, "city_list","extract_pc") -# -# -# -# # Define the function -# delete_matching_text <- function(df, col_a, col_b) { -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Check if the value in column A is not NA and is found within the text in column B -# if(!is.na(df[i, col_a]) && grepl(df[i, col_a], df[i, col_b])) { -# # Remove the matching text from column B by replacing it with an empty string -# df[i, col_b] <- gsub(df[i, col_a], "", df[i, col_b]) -# } -# } -# return(df) -# } -# -# city_clean <- delete_matching_text(city_clean, "extract_pc","city_list") -# -# city_clean <- delete_matching_text(city_clean, "extract_pc","state_list") -# -# -# # remove state if same as city -# -# city_clean$state_list<-ifelse(city_clean$state_list==city_clean$city_list,NA, city_clean$state_list) -# -# -# # there are some usa ones that had state zip buyt no country -# -# -# # Define the function -# extract_postcodes <- function(df, country_list, extract_pc, state_list) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- "\\b[A-Za-z]{2} [0-9]{5}\\b" -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, country_list]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, country_list], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, extract_pc])) { -# df[i, extract_pc] <- extracted_codes[[1]][1] -# df[i, state_list] <- extracted_codes[[1]][1] -# df[i, country_list] <- "usa" -# } -# } -# return(df) -# } -# -# -# # Example usage -# -# city_clean <- extract_postcodes(city_clean, -# "country_list", "extract_pc", "state_list") -# -# -# # remove zip codes from states -# city_clean$state_list<-ifelse(city_clean$country_list=="usa", -# gsub("[0-9]","",city_clean$state_list), -# city_clean$state_list) -# -# # remove state from zipcode -# city_clean$extract_pc<-ifelse(city_clean$country_list=="usa", -# gsub("[a-z]","",city_clean$extract_pc), -# city_clean$extract_pc) -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# city_clean$extract_pc <- trimws(gsub("&", "", city_clean$extract_pc)) -# city_clean[city_clean == ""] <- NA -# -# # -# # -# # -# # -# # -# head(city_clean) -# # -# # # Cleaning up city names with zip codes in them -# # # take the zip code and put it in a new column before deleting -# # -# # -# # -# # -# # # 2) Countries with postal code AFTER city -------------------------------- -# # -# # -# # city_zip<-c("ireland","qatar","kazakhstan","peru","turkey","south korea", -# # "japan","costa rica","mexico","new zealand","iran","thailand", -# # "russia","spain","india","singapore","indonesia","chile", -# # "finland","colombia","taiwan","saudi arabia","uruguay", -# # "slovenia","spain") -# # -# # -# # -# # city_clean$extract_pc<-ifelse((is.na(city_clean$extract_pc)& (city_clean$country_list %in% city_zip)), -# # (gsub(".*[A-Za-z]+", "", city_clean$city_list)), -# # city_clean$extract_pc) -# # # -# # city_clean$city_list<-ifelse((city_clean$country_list %in% city_zip), -# # gsub("\\s([0-9]+)", "", city_clean$city_list), -# # city_clean$city_list) -# # -# # city_clean[city_clean == ""] <- NA -# # -# # -# # # 3) Postal code and dash BEFORE city name -------------------------------- -# # -# # -# # zip_dash<-c("finland","slovakia","austria","portugal","belgium", -# # "spain","israel","czech republic","argentina","france", -# # "sweden","switzerland","turkey","germany","italy", -# # "lithuania","hungary","denmark","poland","norway", "iceland", -# # "greece", "ukraine","estonia","latvia","luxembourg","lativa", -# # "south africa","bulgaria","brazil") -# # -# # -# # -# # -# # city_clean$extract_pc <- ifelse((is.na(city_clean$extract_pc) & -# # (city_clean$country_list %in% zip_dash)), -# # # gsub("\\s([A-Za-z]+)", "", city_clean$city_list), -# # sub(" .*", "", city_clean$city_list), -# # city_clean$extract_pc) -# # -# # -# # city_clean$city_list<-ifelse((city_clean$country_list %in% zip_dash), -# # gsub("[a-zA-Z]+-[0-9]+", "", city_clean$city_list), -# # city_clean$city_list) -# # -# # -# # city_clean[city_clean == ""] <- NA -# # -# # # 4) Netherlands Postal Code ---------------------------------------------- -# # -# # # Netherlands has Postal Code before -# # # it is a combination of 2-3 blocks of letters and numbers -# # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc) & (city_clean$country_list=="netherlands"), -# # (gsub("^(([^ ]+ ){2}).*", "\\1", city_clean$city_list)), -# # city_clean$extract_pc) -# # city_clean$city_list<-ifelse((city_clean$country_list=="netherlands"), -# # (gsub(".*\\s", "", city_clean$city_list)), -# # city_clean$city_list) -# # city_clean[city_clean == ""] <- NA -# # -# # # 5) Venezuela ----------------------------------------------------------- -# # -# # # Venezuela has postal code after, it is combo of letters and numbers -# # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc) & (city_clean$country_list=="venezuela"), -# # gsub(".*\\s([a-z0-9]+)$", "\\1", city_clean$city_list), -# # city_clean$extract_pc) -# # -# # # This then deletes the postal code from the city name -# # city_clean$city_list<-ifelse(city_clean$country_list=="venezuela", -# # (gsub("(\\s[A-Za-z0-9]+)$", "", city_clean$city_list)), -# # city_clean$city_list) -# # city_clean[city_clean == ""] <- NA -# # -# # -# # -# # # trim ws -# # city_clean$extract_pc <- trimws(city_clean$extract_pc, which = "both") -# # city_clean$city_list <- trimws(city_clean$city_list, which = "both") -# # # This removes any that don't have numbers in them -# # city_clean$extract_pc[!grepl("[0-9]", city_clean$extract_pc)] <- NA -# # city_clean$city_list <- gsub("[0-9]+", "", city_clean$city_list) -# # -# # -# # Final Clean Up ---------------------------------------------------------- -# -# # Russia -# city_clean$city_list<-ifelse(city_clean$country_list=="russia", -# (gsub("st Petersburg p", "st petersburg", city_clean$city_list)), -# city_clean$city_list) -# -# -# -# # India -# India_delete<- c("dept") -# -# city_clean$city_list <- ifelse(city_clean$country_list=="india", -# gsub(paste(India_delete, collapse = "|"), NA, city_clean$city_list), -# city_clean$city_list) -# city_clean$city_list<-trimws(city_clean$city_list) # Remove leading/trailing whitespace -# -# # brazil -# city_clean$extract_pc<-ifelse((city_clean$country_list=="brazil" & nchar(city_clean$extract_pc) < 5), -# "",city_clean$extract_pc) -# city_clean[city_clean == ""] <- NA -# -# brazil_delete<- c("barretos canc hosp","univ fed sao paulo", -# "escola filosofia letras & ciencias humanas", -# "hosp sirio libanes","perola byington hosp", -# "sp","univ fed sao paulo","nacl","pesquisa", "museu","dept", -# "lab","zoologia", "inst", "programa","ppg", "ppg") -# -# -# city_clean$city_list <- ifelse(city_clean$country_list=="brazil", -# gsub(paste(brazil_delete, collapse = "|"), NA, city_clean$city_list), -# city_clean$city_list) -# -# city_clean$city_list<-trimws(city_clean$city_list, which="both") # Remove leading/trailing whitespace -# city_clean$state_list<-trimws(city_clean$state_list, which="both") # Remove leading/trailing whitespace -# city_clean$country_list<-trimws(city_clean$country_list, which="both") # Remove leading/trailing whitespace) # Remove leading/trailing whitespace -# # City Abbreviations -# -# city_clean$city_list<-ifelse(city_clean$city_list=="university pk", -# "university park", -# city_clean$city_list) -# -# -# -# city_clean$city_list<-ifelse(city_clean$city_list=="college stn", -# "college station", -# city_clean$city_list) -# -# city_clean$city_list<-ifelse(city_clean$city_list=="n chicago", -# "north chicago", -# city_clean$city_list) -# -# city_clean$city_list<-ifelse(city_clean$city_list=="college pk", -# "college park", -# city_clean$city_list) -# -# city_clean$city_list<-ifelse(city_clean$city_list=="research triangle pk", -# "research triangle park", -# city_clean$city_list) -# -# city_clean$city_list<-ifelse(city_clean$city_list=="state coll", -# "state college", -# city_clean$city_list) -# -# # city corrections -# city_clean$city_list<-ifelse((city_clean$city_list=="dehra dun" & city_clean$country_list == "india"), -# "dehradun", -# city_clean$city_list) -# -# city_clean$city_list<-ifelse((city_clean$city_list=="st john" & city_clean$country_list == "canada"), -# "st. john's", -# city_clean$city_list) -# -# city_clean$state_list<-ifelse(city_clean$state_list=="london ", -# "london", -# city_clean$state_list) -# -# city_clean$city_list<-ifelse((city_clean$state_list=="london" & city_clean$country_list == "england"), -# "london", -# city_clean$city_list) -# -# -# city_clean$state_list<-ifelse(city_clean$state_list=="london", -# NA, -# city_clean$state_list) -# -# -# -# city_list<-city_clean$city_list -# state_list<-city_clean$state_list -# pc_list<-city_clean$extract_pc -# country_list<-city_clean$country_list -# -# -# rm(brazil_delete,India_delete) -# -# -# # rm(city_clean) -# -# -# pc_list[pc_list == ""] <- NA -# city_list[city_list == ""] <- NA -# state_list[state_list == ""] <- NA -# dept_list[dept_list == ""] <- NA -# country_list[country_list == ""] <- NA -# # Create the df that will be returned -# cleaned_ad<-data.frame(ID, -# addresses, -# university_list, -# dept_list, -# city_list, -# country_list, -# state_list, -# pc_list) -# -# -# -# # names(cleaned_ad) -# -# -# list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))]) -# -# # Because formats of address printing is different across platforms -# # We are going to split using a tier system assuming first and last -# # info is somewhat reliable and guess the other info from the -# # remaining position of the info -# -# second_tier_list <- lapply(list_address1, function(x) x[length(x)]) -# second_tier_list <- trimws(second_tier_list, which = "both") -# second_tier_list[second_tier_list == "character(0)"] <- NA -# -# list_address2 <- lapply(list_address1, function(x) x[-c(length(x))]) -# -# third_tier_list <- lapply(list_address2, function(x) x[length(x)]) -# third_tier_list <- trimws(third_tier_list, which = "both") -# third_tier_list[third_tier_list == "character(0)"] <- NA -# -# # All remaining info is just shoved in this category -# remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1]) -# remain_list <- trimws(remain_list, which = "both") -# remain_list[remain_list == "character(0)"] <- NA -# -# -# # original -# # a_df <- data.frame( -# # adID = ID, university = university_list, -# # country = country_list, -# # state = state_list, postal_code = pc_list, city = NA, -# # department = NA, second_tier = second_tier_list, -# # third_tier = third_tier_list, -# # remain = remain_list, address = addresses, -# # stringsAsFactors = FALSE -# # ) -# -# # EB EDIT -# a_df_1 <- data.frame( -# adID = ID, -# university_1 = university_list, -# university = university_list, -# country_1 = country_list, -# country = country_list, -# state_1 = state_list, -# state = state_list, -# postal_code_1 = pc_list, -# postal_code = pc_list, -# city_1 = city_list, -# city = city_list, -# department_1 = dept_list, -# department = dept_list, -# second_tier = second_tier_list, -# third_tier = third_tier_list, -# remain = remain_list, -# address = addresses, -# stringsAsFactors = FALSE -# ) -# -# a_df_1$city_list<-ifelse(is.na(a_df_1$city_1), -# a_df_1$city, -# a_df_1$city_1) -# -# a_df_1$postal_code_1<-ifelse(is.na(a_df_1$postal_code_1), -# a_df_1$postal_code, -# a_df_1$postal_code_1) -# -# -# # Quebec Postal Code is PQ (Fr) or QC (En) but only QC is georeferenced -# a_df_1$state_1<-ifelse(a_df_1$state_1=="pq" & a_df_1$country_1 == "canada", -# "qc", -# a_df_1$state_1) -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# a_df<-a_df_1 -# -# rm(a_df_1) -# -# -# -# # try to fix the usa spots, which vary in format than other countries -# a_df$state<-ifelse(is.na(a_df$state),"",a_df$state) # added by eb to deal with index problem -# a_df$city[nchar(a_df$state) > 0] <- a_df$second_tier[nchar(a_df$state) > 0] -# a_df$state[nchar(a_df$state) == 0] <- NA -# a_df$postal_code[nchar(a_df$postal_code) == 0] <- NA -# a_df$department[!is.na(a_df$state) & !is.na(a_df$postal_code) & -# !is.na(a_df$state)] <- a_df$third_tier[!is.na(a_df$state) & -# !is.na(a_df$postal_code) & !is.na(a_df$state)] -# # fix a US problem when usa is not tacked onto the end -# -# us_reg <- "[[:alpha:]]{2}[[:space:]]{1}[[:digit:]]{5}" -# a_df$state[ grepl(us_reg, a_df$country) ] <- -# substr(a_df$country[ grepl(us_reg, a_df$country) ], 1, 2) -# -# a_df$postal_code[ grepl(us_reg, a_df$country) ] <- -# substr(a_df$country[grepl(us_reg, a_df$country)], 4, 8) -# -# a_df$country[grepl(us_reg, a_df$country)] <- "usa" -# -# -# a_df$state_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", -# a_df$state, -# a_df$state_1) -# -# a_df$postal_code_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", -# a_df$postal_code, -# a_df$postal_code_1) -# -# -# a_df$country_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", -# a_df$country, -# a_df$country_1) -# -# -# ########################## -# # We'll use regular expression to pull zipcodes -# # These formats differ by region -# int1 <- "[[:alpha:]]{2}[[:punct:]]{1}[[:digit:]]{1,8}" -# int2 <- paste("[[:space:]][[:upper:]][[:digit:]][[:upper:]]", -# "[[:space:]][[:digit:]][[:upper:]][[:digit:]]", sep="") -# int3 <- "[[:alpha:]][[:punct:]][[:digit:]]{4,7}" -# int4 <- "[:upper:]{1,2}[:alnum:]{1,3}[:space:][:digit:][:alnum:]{1,3}" -# int <- paste(int1, int2, int3, int4, sep = "|") -# -# uk <- paste("[[:upper:]]{1,2}[[:digit:]]{1,2}[[:space:]]", -# "{1}[[:digit:]]{1}[[:upper:]]{2}", sep="") -# -# mexico <- "[[:space:]]{1}[[:digit:]]{5}" # technically US as well -# -# panama <- "[[:digit:]]{4}-[[:digit:]]{5}" -# -# zip_search <- paste0(int, "|", uk, "|", mexico, "|", panama) -# -# -# -# -# -# # ADDRD EB INSTEAD OF -# a_df$city_1 <- ifelse(is.na(a_df$city_1),a_df$third_tier,a_df$city_1) -# a_df$state_1 <- ifelse(is.na(a_df$state_1),a_df$second_tier,a_df$state_1) -# a_df$postal_code_1 <- ifelse(is.na(a_df$postal_code_1),a_df$second_tier,a_df$postal_code_1) -# -# -# # fix country - usa -# # Function to remove everything before " usa" -# remove_before_usa <- function(x) { -# if (grepl(" usa", x)) { -# return(sub(".*(?= usa)", "", x, perl = TRUE)) -# } else { -# return(x) -# } -# } -# -# # Apply the function to each element in the vector -# a_df$country_1 <- sapply(a_df$country_1, remove_before_usa) -# a_df$country_1 <- trimws(a_df$country_1, which = "both") -# -# -# a_df$state_1 <- ifelse(a_df$country_1=="usa", -# (trimws(gsub("[0-9]", "", a_df$state_1), which="both")), -# a_df$state_1) -# -# a_df$postal_code_1 <- ifelse(a_df$country_1=="usa", -# (trimws(gsub("[a-z]", "", a_df$postal_code_1), which="both")), -# a_df$postal_code_1) -# -# -# -# ########################### -# id_run <- a_df$adID[is.na(a_df$state) & is.na(a_df$postal_code) & -# a_df$address != "Could not be extracted"] -# ########################### -# -# # We now iteratively run through the addresses using the concept that -# # certain information always exists next to each other. -# # Ex. city, state, country tend to exist next to each other. -# # We use the position of the zipcode also to help guide us -# # in where the information lies as well as how many fields were -# # given to us. -# for (i in id_run) { -# found <- FALSE -# row <- which(a_df$adID == i) -# university <- a_df$university[row] -# second_tier <- a_df$second_tier[row] -# third_tier <- a_df$third_tier[row] -# remain <- a_df$remain[row] -# city <- a_df$city[row] -# state <- a_df$state[row] -# postal_code <- a_df$postal_code[row] -# department <- a_df$department[row] -# grepl(zip_search, second_tier) -# grepl(zip_search, third_tier) -# # 2nd tier -# if (grepl(zip_search, second_tier)) { -# found <- TRUE -# postal_code <- regmatches(second_tier, regexpr(zip_search, second_tier)) -# city <- gsub(zip_search, "", second_tier) -# department <- ifelse(is.na(remain), third_tier, remain) -# } -# # 3RD tiers -# if (grepl(zip_search, third_tier) & !found) { -# found <- TRUE -# postal_code <- regmatches(third_tier, regexpr(zip_search, third_tier)) -# city <- gsub(zip_search, "", third_tier) -# state <- second_tier -# department <- remain -# } -# -# if (!found) { -# state <- second_tier -# city <- third_tier -# department <- remain -# } -# # To make university searching more efficient we'll override values -# # based on if it has university/college in the name, -# # where university overides college -# override_univ <- grepl("\\buniv\\b|\\buniversi", -# c(second_tier, third_tier, remain, city, university), -# ignore.case = TRUE) & -# !grepl("\\bdrv\\b|\\bdrive\\b", -# c(second_tier, third_tier, remain, city, university), -# ignore.case = TRUE) -# -# if (any(override_univ)) { -# university <- -# c(second_tier, third_tier, remain, city, university)[override_univ][1] -# assign( -# c("second_tier", "third_tier", "remain", "city", "university")[ -# override_univ][1], -# NA -# ) -# } -# # only if university doesnt already exist -# override_univ_col <- -# grepl("\\bcol\\b|college|\\bcoll\\b", -# c(second_tier, third_tier, remain, city, university), -# ignore.case = TRUE) & -# !grepl("\\bdrv\\b|\\bdrive\\b", -# c(second_tier, third_tier, remain, city, university), -# ignore.case = TRUE) -# -# if (!any(override_univ) & any(override_univ_col)) { -# university <- -# c(second_tier, third_tier, remain, city, university )[ -# override_univ_col][1] -# -# assign( -# c("second_tier", "third_tier", "remain", "city", "university")[ -# override_univ_col][1], -# NA -# ) -# } -# # more risky, but institutions as well, just incase its not a university -# override_univ_inst <- grepl("\\binst\\b|\\binstitut", -# c(second_tier, third_tier, remain, city, university), -# ignore.case = TRUE) -# if ( -# !any(override_univ) & !any(override_univ_col) & any(override_univ_inst) -# ) { -# department <- c(second_tier, third_tier, remain, city, university )[ -# override_univ_inst][1] -# -# assign( -# c("second_tier", "third_tier", "remain", "city", "university")[ -# override_univ_inst][1], -# NA -# ) -# } -# -# a_df$city[row] <- gsub("[[:digit:]]", "", city) -# a_df$state[row] <- gsub("[[:digit:]]", "", state) -# a_df$postal_code[row] <- postal_code -# a_df$department[row] <- department -# -# -# -# #########################Clock############################### -# total <- length(id_run) -# pb <- utils::txtProgressBar(min = 0, max = total, style = 3) -# utils::setTxtProgressBar(pb, which(id_run == i)) -# ############################################################# -# } -# -# -# city_fix <- is.na(a_df$city) & !is.na(a_df$state) -# a_df$city[city_fix] <- a_df$state[city_fix] -# a_df$state[city_fix] <- NA -# a_df$university[a_df$university == "Could not be extracted"] <- NA -# a_df$country[a_df$country == "Could not be extracted"] <- NA -# # a_df$country[a_df$country == "peoples r china"] <- "China" -# # a_df$country[a_df$country == "U Arab Emirates"] <- "United Arab Emirates" -# # a_df$country[a_df$country == "Mongol Peo Rep"] <- "Mongolia" -# -# a_df$postal_code[grepl("[[:alpha:]]{1,2}-", a_df$postal_code)] <- -# vapply(strsplit( -# a_df$postal_code[ grepl("[[:alpha:]]{1,2}-", a_df$postal_code)], -# "-"), -# function(x) x[2], character(1) -# ) -# #strip periods from the ends of city,state,country -# a_df$city <- gsub("\\.", "", a_df$city) -# a_df$state <- gsub("\\.", "", a_df$state) -# a_df$country <- gsub("\\.", "", a_df$country) -# a_df$country[a_df$country == ""] <- NA -# a_df$university[a_df$university == ""] <- NA -# a_df$postal_code[a_df$postal_code == ""] <- NA -# #convert to lower -# for (l in 2:ncol(a_df)){ -# a_df[, l] <- tolower(a_df[, l]) -# } -# -# # Select columns -# a_df <- a_df[, c("adID", -# "university_1", -# "country_1", -# "state_1", -# "postal_code_1", -# "city_1", -# "department_1", -# "second_tier", -# "third_tier", -# "remain", -# "address") -# ] -# -# # Rename columns -# colnames(a_df) <- c("adID", -# "university", -# "country", -# "state", -# "postal_code", -# "city", -# "department", -# "second_tier", -# "third_tier", -# "remain", -# "address") -# -# -# # sometimes the postal code fails to prse out of state. canm use this -# # when postal code is missing, but then need to remove -# # Function to extract numbers from one column and copy them to another column -# extract_numbers <- function(df, source_col, target_col) { -# if (is.na(target_col)) { -# -# -# df[[target_col]] <- gsub(".*?(\\d+).*", "\\1", df[[source_col]]) -# df[[target_col]][!grepl("\\d", df[[source_col]])] <- NA -# return(df) -# -# } else { -# return(df) -# } -# -# } -# -# # Apply the function to the dataframe -# a_df <- extract_numbers(a_df, "state", "postal_code") -# -# -# -# # ther postal code and city are sometimes in tier3 -# a_df$city <- ifelse(is.na(a_df$city), a_df$third_tier, a_df$city) -# a_df$postal_code <- ifelse(is.na(a_df$postal_code), a_df$third_tier, a_df$postal_code) -# -# -# -# -# -# # Function to remove matching characters from col1 based on col2 -# remove_matching <- function(col1, col2) { -# pattern <- paste0("\\b", gsub("([\\W])", "\\\\\\1", col2), "\\b") -# result <- sub(pattern, "", col1) -# trimws(result) -# } -# -# # Apply the function to each row -# a_df$city <- mapply(remove_matching, a_df$city, a_df$postal_code) -# a_df$state <- mapply(remove_matching, a_df$state, a_df$postal_code) -# -# -# -# library(tidyverse) -# -# -# country<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country) %>% -# mutate(summary=nchar(country)) %>% -# arrange(country) -# -# -# country_city<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country,city) %>% -# mutate(city_char=nchar(city)) %>% -# arrange(country,city) -# -# -# -# country_state<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country,state) %>% -# mutate(city_char=nchar(state)) %>% -# arrange(country,state) -# -# country_state_city<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country ,state,city) %>% -# mutate(city_char=nchar(city)) %>% -# arrange(country,state,city) -# -# -# country_state_city_pc<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country ,state,postal_code,city) %>% -# mutate(city_char=nchar(city)) %>% -# arrange(country,state,postal_code,city) -# -# return(a_df) -# } -# -# -# -# -# -# -# -# -# -# -# #. old -# country_list <- vapply(list_address, function(x) { -# gsub("\\_", "", x[length(x)]) }, -# character(1)) -# country_list <- trimws(country_list, which = "both") -# pc_list <- rep(NA, length(list_address)) -# state_list <- rep(NA, length(list_address)) -# city_list<- rep(NA, length(list_address)) -# country_list <- ifelse(grepl("usa", country_list), "usa", country_list) -# -# list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))]) -# -# -# -# pc_list <- trimws(substr(country_list, 1, (vapply(regexpr("usa", -# country_list), function(x) x[1], numeric(1))) - 1), which = "right") -# state_list <- pc_list -# -# state_list[nchar(state_list) > 0] <- regmatches( -# state_list[nchar(state_list) > 0], -# regexpr("[[:lower:]]{2}", state_list[nchar(state_list) > 0]) -# ) -# state_list[state_list == ""] <- NA -# -# pc_list[nchar(pc_list) > 2] <- regmatches(pc_list[nchar(pc_list) > 2], -# regexpr("[[:digit:]]{5}", pc_list[nchar(pc_list) > 2])) -# pc_list[nchar(pc_list) < 3] <- "" -# pc_list[pc_list == ""] <- NA -# -# -# -# -# city_list <- lapply(list_address1, function(x) x[length(x)]) -# city_list <- trimws(city_list, which = "both") -# city_list[city_list == "character(0)"] <- NA -# -# list_address2 <- lapply(list_address1, function(x) x[-c(length(x))]) -# -# dept_list <- lapply(list_address2, function(x) x[length(x)]) -# dept_list <- trimws(dept_list, which = "both") -# dept_list[dept_list == "character(0)"] <- NA -# -# # All remaining info is just shoved in this category -# remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1]) -# remain_list <- trimws(remain_list, which = "both") -# remain_list[remain_list == "character(0)"] <- NA -# -# -# a_df <- data.frame( -# adID = ID, -# university = university_list, -# country = country_list, -# city = city_list, -# state = state_list, -# postal_code = pc_list, -# department = dept_list, -# remain = remain_list, -# address = addresses, -# stringsAsFactors = FALSE -# ) -# -# -# -# -# # # extracting postal codes - USA ------------------------------------------- -# -# # # USA -------------------------------------------------------------------- -# # -# # process_usa_address <- function(x) { -# # if (length(x) == 1) { -# # return(c(NA, NA)) # Not usa -# # } -# # if (grepl(" usa", x[[length(x)]])) { -# # if (length(x) == 4) { -# # return(c(trimws(x[[length(x) - 1]]), trimws(x[length(x)]))) -# # } -# # if (length(x) == 5) { -# # return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x)]]))) -# # } -# # if (length(x) == 3) { -# # return(c(trimws(x[[length(x) - 1]]), trimws(x[length(x)]))) -# # } else { -# # return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x)]]))) -# # } -# # } -# # -# # return(c(NA, NA)) # Not usa -# # } -# # -# # -# # # Apply the function across all addresses using `mapply` -# # results <- t(mapply(process_usa_address, list_address)) -# # colnames(results) <- c("usa_city", "usa_state") -# # -# # results<-as.data.frame(results) -# # results$pc<-NA -# # results$country<-NA -# # extract_usa_postcodes <- function(df, usa_state, pc,country) { -# # # 6 digits -# # pattern <- "[0-9]{5}" -# # -# # # Loop through each row of the dataframe -# # for(i in 1:nrow(df)) { -# # # Find all matches of the pattern in the source column -# # matches <- gregexpr(pattern, df[i, usa_state]) -# # # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # # Extract the matches -# # extracted_codes <- regmatches(df[i, usa_state], matches) -# # # If there's at least one match and the target column is NA, copy the first match to the target column -# # if(length(extracted_codes[[1]]) > 0 && is.na(df[i, pc])) { -# # df[i, pc] <- extracted_codes[[1]][1] -# # df[i, country] <- "usa" -# # # df[i, city_col] <- df[i, source_col] -# # -# # } -# # } -# # return(df) -# # } -# # -# # -# # results <- extract_usa_postcodes(results, "usa_state", "pc","country") -# # -# # -# # -# # # Update `city_list` if necessary -# # city_list <- ifelse(is.na(city_list), results$usa_city, city_list) -# # # -# # # # Update `state_list` if necessary -# # state_list<-ifelse(is.na(state_list),results$usa_state, state_list) -# # # -# # # # Update `pc_list` if necessary -# # pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) -# # # Update `country_list` if necessary -# # country_list<-ifelse(is.na(country_list),results$country, country_list) -# # -# # -# # -# # # Because formats of address printing is different across platforms -# # # We are going to split using a tier system assuming first and last -# # # info is somewhat reliable and guess the other info from the -# # # remaining position of the info -# # -# # -# # -# # -# # any without numbers gets NA'd -# # results$pc[!grepl("\\d", results$pc)] <- NA -# -# extract_usa_postcodes <- function(df, source, dest1, dest2,dest3) { -# # state and zip -# pattern <- "[a-z]{2} [0-9]{5}" -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# if (df[i, dest3]=="usa"){ -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# } -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, dest1])) { -# df[i, dest1] <- extracted_codes[[1]][1] -# } -# # if(length(extracted_codes[[1]]) > 0 && is.na(df[i, dest1])) { -# # df[i, dest2] <- df[i, source] -# # } -# # if(length(extracted_codes[[1]]) > 0 && is.na(df[i, dest1])) { -# # df[i, dest3] <- "usa" -# # } -# } -# return(df) -# } -# -# -# -# a_df <- extract_usa_postcodes(a_df, "city", "postal_code", "state","country") -# a_df <- extract_usa_postcodes(a_df, "country","postal_code","state","country") -# -# a_df$city<- ifelse(a_df$country=="usa" & ((a_df$city==a_df$state) & (a_df$state==a_df$postal_code)), -# a_df$department,a_df$city) -# -# # keep portions with numbers / remove city names -# a_df$state<-ifelse(a_df$country=="usa", sub(" .*", "",results$usa_state), a_df$state) -# a_df$state<-ifelse(a_df$country=="usa", sub(" usa","",results$usa_state), a_df$state) -# results$usa_state<-sub("[0-9]{5}","",results$usa_state) -# results$usa_state<-sub("usa","",results$usa_state) -# results$usa_state<-trimws(results$usa_state, which = "both") -# -# results$country<-ifelse(is.na(results$usa_city)==FALSE,"usa",results$country) -# -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$usa_city, city_list) -# # -# # # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$usa_state, state_list) -# # -# # # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) -# # Update `country_list` if necessary -# -# country_list<-ifelse((grepl("usa",country_list)),"usa", country_list) -# # remove any with "state_abbrev zip code" but no USA -# country_list <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", country_list, ignore.case = TRUE), "usa", country_list) -# -# -# -# us_state_abbreviations_lower <- c("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", -# "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", -# "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", -# "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", -# "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy") -# -# country_list<-ifelse(country_list%in%us_state_abbreviations_lower,"usa", country_list) -# -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$usa_city, city_list) -# # -# # # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$usa_state, state_list) -# # -# # # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) -# # Update `country_list` if necessary -# -# rm(results) -# -# -# # a_df$postal_code <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$country, ignore.case = TRUE), a_df$country, a_df$postal_code) -# # a_df$state <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$country, ignore.case = TRUE), a_df$country, a_df$state) -# # a_df$state <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$state, ignore.case = TRUE), gsub("[0-9]{5}","",a_df$state), a_df$state) -# # a_df$postal_code <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$postal_code, ignore.case = TRUE), gsub("[a-z]{2}","",a_df$postal_code), a_df$postal_code) -# # a_df$country <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$country, ignore.case = TRUE), "usa", a_df$country) -# # -# # a_df$state <-trimws(a_df$state,which = "both") -# # a_df$postal_code<- trimws(a_df$postal_code,which = "both") -# # a_df$country <- trimws(a_df$country,which = "both") -# -# -# -# # Postal Codes letters-numbers -------------------------------------------- -# -# # Define the function -# extract_postcodes <- function(df, source_col, target_col,sequence) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- sequence -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# } -# } -# return(df) -# } -# -# # usage -# -# sequence<-"\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" -# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) -# sequence<-"\\bbr[-][0-9]{2} [0-9]{5}\\b" -# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) -# sequence<-"\\b[0-9]{5}-[0-9]{3}\\b" -# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) -# -# -# # postal codes - numbers after -------------------------------------------- -# sequence<-"\\b [0-9]{3,8}\\b" -# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) -# -# -# -# # postal codes - uk+ ------------------------------------------------------ -# -# # Define the function -# extract_uk_postcodes <- function(df, source_col, target_col,city_col,sequence) { -# # Regular expression pattern for UK postal codes -# # One or two initial letters. -# # One or two digits (and possibly a letter). -# # A mandatory space. -# # One digit followed by two letters. -# pattern <- sequence -# -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# df[i, city_col] <- df[i, source_col] -# -# } -# } -# return(df) -# } -# -# # Example usage -# sequence <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" -# # extract_uk_postcodes <- function(df, source_col, target_col,city_col,sequence) -# -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# sequence <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [A-Za-z]{2}" -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# sequence <- "[A-Za-z]{1,2}[0-9R]{3} [A-Za-z]{3}" -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# sequence <- "[A-Za-z]{2}[0-9R]{3}[A-Za-z]{2}" -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# sequence <- "[A-Za-z]{1}[0-9]{3} [0-9]{1}[A-Za-z]{2}" -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# sequence <- "[0-9]{4} [0-9]{2}[A-Za-z]{1}" -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# sequence <- "[A-Za-z]{2} [A-Za-z]{1} [0-9]{3}[A-Za-z]{2}" -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# -# # postal codes - canada --------------------------------------------------- -# a_df$postal_code <- ifelse(a_df$country=="canada", NA,a_df$postal_code) -# -# a_df$state <- ifelse(a_df$country=="canada" & -# grepl("[A-Za-z]{2}", a_df$city, ignore.case = TRUE), -# a_df$city, a_df$state) -# -# a_df$city <- ifelse(a_df$country=="canada" & -# a_df$city==a_df$state, -# NA, a_df$city) -# -# -# -# -# # Define the function -# extract_postcodes <- function(df, source_col, target_col,sequence) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- sequence -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# } -# } -# return(df) -# } -# -# # usage -# -# sequence<-"\\b[A-Za-z]{1}[0-9]{1}[A-Za-z]{1} [0-9]{1}[A-Za-z]{1}[0-9]{1}\\b" -# a_df <- extract_postcodes(a_df, "state","postal_code",sequence) -# -# sequence<-"\\b[A-Za-z]{1}[0-9]{1}[A-Za-z]{1} [0-9]{1}[A-Za-z]{1}[0-9]{1}\\b" -# a_df <- extract_postcodes(a_df, "department","postal_code",sequence) -# -# a_df$state<-ifelse(a_df$country=="canada", sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", a_df$state),a_df$state) # all after first space -# -# -# a_df$city<-ifelse(a_df$country=="canada" & is.na(a_df$city),a_df$department,a_df$city) # all after first space -# -# a_df$city<-ifelse(a_df$country=="canada", sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", a_df$city),a_df$city) # all after first space -# -# a_df$state<-ifelse(a_df$country=="canada", gsub("[0-9]", "", a_df$state),a_df$state) -# -# -# -# # postal codes - india ---------------------------------------------------- -# -# a_df$state<-ifelse(a_df$country=="india", a_df$city,a_df$state) -# -# # Define the function -# extract_postcodes <- function(df, source_col, target_col,city_col,sequence) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- sequence -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# df[i, city_col] <- df[i, source_col] -# } -# } -# return(df) -# } -# -# # usage -# -# sequence<-"[0-9]{5,8}" -# a_df <- extract_postcodes(a_df, "department","postal_code","city",sequence) -# -# -# a_df$department<-ifelse(a_df$country=="india" & a_df$city==a_df$department,NA,a_df$department) -# a_df$city<-ifelse(a_df$country=="india" & a_df$city==a_df$state,NA,a_df$city) -# a_df$city<-ifelse(a_df$country=="india" & is.na(a_df$city),a_df$department,a_df$city) -# -# -# # Define the function -# extract_postcodes2 <- function(df, source_col, target_col,city_col,sequence) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- sequence -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0) { -# df[i, city_col] <- df[i, source_col] -# } -# } -# return(df) -# } -# -# # usage -# -# sequence<-"[0-9]{5,8}" -# a_df <- extract_postcodes2(a_df, "postal_code","city","city",sequence) -# -# a_df$city <- ifelse(a_df$country=="india" & (grepl("delhi", a_df$city)|grepl("delhi", a_df$state)), -# "new delhi", a_df$city) -# -# -# -# -# -# # postal codes - australia ------------------------------------------------- -# -# a_df$state<-ifelse(a_df$country=="australia", a_df$city,a_df$state) -# a_df$city<-ifelse(a_df$country=="australia", a_df$department,a_df$city) -# -# a_df$state<-ifelse(a_df$country=="australia", gsub("\\s[0-9]{4,5}", "", a_df$state),a_df$state) -# a_df$city<-ifelse(a_df$country=="australia", gsub("\\s[0-9]{4,5}", "", a_df$city),a_df$city) -# -# -# # Brazil ------------------------------------------------------------------ -# -# -# -# -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# to_check <- c("dept","ctr","inst","ppg","andar","empresas", -# "programa", "ciencias", "unidade", "lab ") -# -# a_df$department <- ifelse(a_df$country=="brazil" & grepl(paste(to_check, collapse = "|"), a_df$department), -# NA, -# a_df$department) -# -# -# -# -# -# -# # Define the function -# extract_brl_postcodes <- function(df, source_col, target_col) { -# # 6 digits -# -# pattern <- "br-[0-9]{5,8}" -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# # df[i, brl_new_city] <- df[i, source_col] -# # df[i, city_col] <- df[i, source_col] -# -# } -# } -# return(df) -# } -# -# -# # df, source_col, target_col,city_col -# a_df <- extract_brl_postcodes(a_df, "department","postal_code") -# -# -# -# -# -# -# -# -# -# -# -# -# -# results <- extract_brl_postcodes(results, "brl_state","brl_pc") -# -# -# results$brl_new_city <- ifelse(is.na(results$brl_new_city), -# results$brl_state, -# results$brl_new_city) -# -# -# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, -# results$brl_city, -# results$brl_state) -# -# -# -# -# results$brl_new_city <- ifelse(is.na(results$brl_new_city), -# results$brl_city, -# results$brl_new_city) -# -# -# -# -# results$brl_city<-gsub("br-","",results$brl_city) -# results$brl_city<-sub("[0-9]{2,8}","",results$brl_city) -# results$brl_city<-sub("-","",results$brl_city) -# -# -# results$brl_new_city<-gsub("br-","",results$brl_new_city) -# results$brl_new_city<-sub("[0-9]{2,8}","",results$brl_new_city) -# results$brl_new_city<-sub("-","",results$brl_new_city) -# -# -# results$brl_pc<-gsub("br-","",results$brl_pc) -# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) -# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) -# -# results$brl_state<-gsub("br-","",results$brl_state) -# results$brl_state<-sub("[0-9]{2,8} ","",results$brl_state) -# results$brl_state<-sub(" [0-9]{2,8} ","",results$brl_state) -# -# # -# # -# # -# # -# # -# # results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) -# # results$brl_new_city<-gsub("gavea rio de janeiro","rio de janeiro",results$brl_new_city) -# # results$brl_new_city<-gsub("gavea","rio de janeiro",results$brl_new_city) -# # # -# # results$brl_city<-gsub("-","",results$brl_city) -# # -# # results$brl_state<-gsub("br-","",results$brl_state) -# # results$brl_state<-gsub("-","",results$brl_state) -# -# -# -# -# -# # any without numbers gets NA'd -# results$brl_pc[!grepl("\\d", results$brl_pc)] <- NA -# -# # keep portions with numbers / remove city names -# -# results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) -# results$brl_city<-sub(" [0-9]{2,8}","",results$brl_city) -# # -# # # Specific replacements -# # results$brl_city <- ifelse(grepl("rio de janeiro", results$brl_city), -# # "rio de janeiro", results$brl_city) -# -# results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), -# "rio de janeiro", results$brl_city) -# results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), -# "rj", results$brl_state) -# results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), -# "sp", results$brl_state) -# -# -# -# results$brl_city[results$brl_city==results$brl_state]<-NA -# -# -# # Clean up and adjust columns -# results[] <- lapply(results, trimws) -# -# -# # Define city-to-state mapping -# city_state_mapping <- data.frame( -# city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), -# state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), -# stringsAsFactors = FALSE -# ) -# -# # Match cities and states -# for (i in 1:nrow(city_state_mapping)) { -# results$brl_city <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), -# city_state_mapping$city[i], results$brl_city) -# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), -# city_state_mapping$state[i], results$brl_state) -# } -# -# -# # Match cities and states -# for (i in 1:nrow(city_state_mapping)) { -# -# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), -# city_state_mapping$state[i], results$brl_state) -# } -# -# -# -# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, -# results$brl_city, -# results$brl_state) -# -# -# results$brl_city <- trimws(results$brl_city, which = "both") -# results$brl_state <- trimws(results$brl_state, which = "both") -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# to_check <- c("dept","ctr","inst","ppg", -# "programa", "ciencias", "unidade", "lab ") -# -# results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), -# results$brl_state, -# results$brl_city) -# -# -# # Final trimming -# results[] <- lapply(results, trimws) -# -# results$brl_city <- ifelse(results$brl_new_city==results$brl_city, -# NA, -# results$brl_city) -# From a1dddf2d1e4b36d9a8c7955593dd5e743e135962 Mon Sep 17 00:00:00 2001 From: embruna Date: Fri, 7 Mar 2025 16:41:47 -0500 Subject: [PATCH 13/34] updating address parsing --- R/authors_address_update.R | 3752 ++++++++++++++++++++++++++++++++++++ 1 file changed, 3752 insertions(+) create mode 100644 R/authors_address_update.R diff --git a/R/authors_address_update.R b/R/authors_address_update.R new file mode 100644 index 0000000..7888f02 --- /dev/null +++ b/R/authors_address_update.R @@ -0,0 +1,3752 @@ +#' Parses out address information and splits it into its respective parts. +#' This is an internal function used by \code{authors_clean} +#' +#' \code{authors_address} This function takes the output from +#' \code{references_read} and pulls out address information. Splitting it into +#' university, department, city, state, etc. +#' @param addresses the addresses +#' @param ID the authorID +#' @noRd +authors_address <- function(addresses, ID) { + # DELETE + # library(tidyverse) + # library(refsplitr) + final <- read.csv("./data/wos_txt/final.csv") + addresses <- final$address + ID <- final$authorID + addresses <- tolower(addresses) + + + message("\nSplitting addresses\n") + list_address <- strsplit(addresses, ",") + + + + # remove punctuation ---------------------------------------- + + ## First remove periods and trim white space from countries. + ## helps avoids mistakes later on + + remove_period_from_last <- function(list_address) { + lapply(list_address, function(x) { + if (length(x) > 0) { + x[length(x)] <- gsub("\\.$", "", x[length(x)]) + x[length(x)] <- trimws(x[length(x)], which = "both") + } + return(x) + }) + } + + list_address <- remove_period_from_last(list_address) + + # trim ws ----------------------------------------------------------------- + + list_address <- lapply(list_address, trimws) + + # correct countries ------------------------------------------------------- + + + # correct or update names of some countries to make it possible to georef + + # Define the function + correct_countries <- function(my_list, replacements) { + # Loop through each element of the list + for (i in 1:length(my_list)) { + # Get the length of the current element + len <- length(my_list[[i]]) + + # Check if the last item matches any of the target words + if (len > 0 && my_list[[i]][len] %in% names(replacements)) { + # Replace the last item with the corresponding replacement word + my_list[[i]][len] <- replacements[[my_list[[i]][len]]] + } + } + return(my_list) + } + # NB: also updated country names. + # czechia = new name for czech republic + # united arab rep = current country name depends on city + replacements <- c( + "austl" = "australia", + "c z" = "czechia", + "cz" = "czechia", + "czech republic" = "czechia", + "fed rep ger" = "germany", + "columbia" = "colombia", + "peoples r china" = "china", + "u arab emirates" = "united arab emirates", + "mongol peo rep" = "mongolia", + "dominican rep" = "dominican republic", + "fr polynesia" = "french polynesia", + "neth antilles" = "netherland antilles", + "trinid & tobago" = "trinidad & tobago", + "rep congo" = "congo", + "north ireland" = "northern ireland", + "syrian arab rep" = "syria" + ) + + list_address <- correct_countries(list_address, replacements) + + # extract university ------------------------------------------------------ + + university_list <- vapply(list_address, function(x) x[1], character(1)) + + # extract department ------------------------------------------------------ + + # If department is listed it is typically second + # (EB note: only if 4+ slots) + # this will be 2x checked later + + dept_extract <- function(x) { + if (length(x) < 4) { + return(NA) + } else { + return(trimws(x[[2]])) + } + } + + dept_list <- unlist(lapply(list_address, dept_extract)) + + dept_list <- trimws(dept_list, which = "both") + + + # Extract City ------------------------------------------------------------ + + # If there is only one element, then it can't have both city and country' + city_list <- vapply(list_address, function(x) { + n <- length(x) + if (n == 1) { + return("no city") # placeholder to replace with NA after function + } + + # In some cases city is next-to-last element, in others next-to-next-to-last + last_element <- x[[n]] + second_last <- if (n > 1) x[[n - 1]] else NA + third_last <- if (n > 2) x[[n - 2]] else NA + + # Default case + return(second_last) + }, character(1)) + + # Cleanup + city_list <- trimws(city_list, which = "both") + city_list[city_list == "no city"] <- NA + + + # extract state ----------------------------------------------------------- + + # If there is only one element, then it can't have both city and country' + state_list <- vapply(list_address, function(x) { + n <- length(x) + if (n == 1) { + return("no state") # placeholder to replace with NA after function + } + + # In some cases city is next-to-last element, in others next-to-next-to-last + last_element <- x[[n]] + second_last <- if (n > 1) x[[n - 1]] else NA + third_last <- if (n > 2) x[[n - 2]] else NA + + # Default case + return(third_last) + }, character(1)) + + # Cleanup + state_list <- trimws(state_list, which = "both") + state_list[state_list == "no state"] <- NA + + # this is used to double check later - sometimes city is extracted as state + city_list2 <- trimws(state_list, which = "both") + + # Extract Country --------------------------------------------------------- + + country_list <- vapply( + list_address, function(x) { + gsub("\\_", "", x[length(x)]) + }, + character(1) + ) + + + # postal code (pc) list --------------------------------------------------- + + # pc often with city + + pc_list <- city_list + + # bind all into df -------------------------------------------------------- + + a_df <- data.frame( + adID = ID, + university = university_list, + country = country_list, + state = state_list, + postal_code = pc_list, + city = city_list, + city2 = city_list2, + department = dept_list, + address = addresses, + stringsAsFactors = FALSE + ) + + + + # any PC without numbers gets NA'd + a_df$postal_code[!grepl("\\d", a_df$postal_code)] <- NA + + # copy over PC and state + a_df$state <- ifelse(grepl("usa", a_df$country) & nchar(a_df$state) > 2, + NA, + a_df$state + ) + + + a_df$postal_code <- ifelse(grepl("[a-z]{2} [0-9]{5} usa", a_df$country), + a_df$country, a_df$postal_code + ) + + a_df$state <- ifelse(grepl("[a-z]{2} [0-9]{5} usa", a_df$country), + a_df$country, a_df$state + ) + + a_df$state <- ifelse(grepl("[a-z]{2} [0-9]{5}", a_df$city), + a_df$city, a_df$state + ) + + + a_df$state <- ifelse(grepl("[a-z]{2} usa", a_df$country), + a_df$country, a_df$state + ) + + # remove the numbers and letters as appropriate + + + a_df$country <- ifelse(grepl(" usa", a_df$country), + "usa", a_df$country + ) + + a_df$state <- ifelse(a_df$country == "usa" & grepl("[a-z]{2} [0-9]{5}", a_df$state), + gsub("[[:digit:]]{5}", "", a_df$state), a_df$state + ) + + a_df$state <- ifelse(a_df$country == "usa" & grepl(" usa", a_df$state), + gsub(" usa", "", a_df$state), a_df$state + ) + + + a_df$postal_code <- ifelse(a_df$country == "usa", + gsub( + "[[:alpha:]]{2} ", "", + a_df$postal_code + ), a_df$postal_code + ) + + a_df$postal_code <- ifelse(a_df$country == "usa", + gsub( + " usa", "", + a_df$postal_code + ), a_df$postal_code + ) + + + + + a_df$city <- ifelse(a_df$country == "usa" & grepl("[a-z]{2} [0-9]{5}", a_df$city), + a_df$city2, a_df$city + ) + + + pattern <- "[a-z]{2} [0-9]{5}" + + a_df$postal_code <- ifelse(grepl(pattern, a_df$country), + a_df$country, a_df$postal_code + ) + a_df$state <- ifelse(grepl(pattern, a_df$country), + a_df$country, a_df$state + ) + a_df$country <- ifelse(grepl(pattern, a_df$country), + "usa", a_df$country + ) + a_df$postal_code <- ifelse(a_df$country == "usa" & grepl(pattern, a_df$postal_code), + gsub("[a-z]", "", a_df$postal_code), a_df$postal_code + ) + a_df$state <- ifelse(a_df$country == "usa" & grepl(pattern, a_df$state), + gsub("[0-9]", "", a_df$postal_code), a_df$state + ) + + + # TODO: correct this to catch any that didn;t get caught + + # + # us_state_abbreviations_lower <- c("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", + # "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", + # "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", + # "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", + # "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy") + # + # country_list<-ifelse(country_list%in%us_state_abbreviations_lower,"usa", country_list) + + + + # BRAZIL clean-up --------------------------------------------------------- + + a_df$state <- ifelse(a_df$country == "brazil" & nchar(a_df$city) == 2, + a_df$city, a_df$state + ) + a_df$city <- ifelse(a_df$country == "brazil" & nchar(a_df$city) == 2, + a_df$city2, a_df$city + ) + a_df$city2 <- ifelse(a_df$country == "brazil" & a_df$city == a_df$city2, + NA, a_df$city2 + ) + a_df$postal_code <- ifelse(a_df$country == "brazil" & is.na(a_df$postal_code), + a_df$city, a_df$postal_code + ) + a_df$state <- ifelse(a_df$country == "brazil" & nchar(a_df$state) > 2, + NA, a_df$state + ) + + a_df$postal_code <- ifelse(a_df$country == "brazil", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country == "brazil", + gsub("[-]", "", a_df$postal_code), + a_df$postal_code + ) + + a_df$city <- ifelse(a_df$country == "brazil", + gsub("br-", "", a_df$city), + a_df$city + ) + a_df$city <- ifelse(a_df$country == "brazil", + gsub("[0-9]", "", a_df$city), + a_df$city + ) + + + a_df$state <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2), + a_df$city, a_df$state + ) + a_df$postal_code <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2), + a_df$city2, a_df$postal_code + ) + a_df$city <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2), + a_df$city2, a_df$city + ) + + + # repeat the clean of city + a_df$city <- ifelse(a_df$country == "brazil", + gsub("br-", "", a_df$city), + a_df$city + ) + a_df$city <- ifelse(a_df$country == "brazil", + gsub("[0-9]", "", a_df$city), + a_df$city + ) + + + a_df$postal_code <- ifelse(a_df$country == "brazil", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country == "brazil", + gsub("[-]", "", a_df$postal_code), + a_df$postal_code + ) + + a_df[] <- lapply(a_df, trimws) + + + # Define city-to-state mapping + city_state_mapping <- data.frame( + city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), + state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), + stringsAsFactors = FALSE + ) + + # Match cities and states + for (i in 1:nrow(city_state_mapping)) { + a_df$city <- ifelse(a_df$country == "brazil" & grepl(city_state_mapping$city[i], a_df$state, ignore.case = TRUE), + city_state_mapping$city[i], a_df$city + ) + a_df$state <- ifelse(a_df$country == "brazil" & grepl(city_state_mapping$city[i], a_df$state, ignore.case = TRUE), + city_state_mapping$state[i], a_df$state + ) + } + + # Match cities and states + for (i in 1:nrow(city_state_mapping)) { + a_df$state <- ifelse(a_df$country == "brazil" & grepl(city_state_mapping$city[i], a_df$city, ignore.case = TRUE), + city_state_mapping$state[i], a_df$state + ) + } + + + + # # Define words indicating this is actually a dept not state or postal code + # # will use this list to delete the ones that don't apply + # to_check <- c("dept","ctr","inst","ppg","andar","empresas", + # "programa", "ciencias", "unidade", "lab ") + # + # a_df$city <- ifelse(a_df$country=="brazil" & grepl(paste(to_check, collapse = "|"), a_df$city), + # a_df$state, + # results$brl_city) + # + # + # + # results$brl_state <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_state), + # results$brl_city, + # results$brl_state) + + # + # results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), + # "rio de janeiro", results$brl_city) + # results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), + # "rj", results$brl_state) + # results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), + # "sp", results$brl_state) + # + + + # results$brl_city[results$brl_city==results$brl_state]<-NA + + + # Clean up and adjust columns + # results[] <- lapply(results, trimws) + + + + + # brazil_delete<- c("barretos canc hosp","univ fed sao paulo", + # "escola filosofia letras & ciencias humanas", + # "hosp sirio libanes","perola byington hosp", + # "sp","univ fed sao paulo","nacl","pesquisa", "museu","dept", + # "lab","zoologia", "inst", "programa","ppg", "ppg") + + + # city_clean$city_list <- ifelse(city_clean$country_list=="brazil", + # gsub(paste(brazil_delete, collapse = "|"), NA, city_clean$city_list), + # city_clean$city_list) + + # AUSTRALIA clean-up--------------------------------------------------------------- + + a_df$state <- ifelse(a_df$country == "australia", + a_df$city, a_df$state + ) + a_df$postal_code <- ifelse(a_df$country == "australia", + a_df$city, a_df$postal_code + ) + a_df$city <- ifelse(a_df$country == "australia", + a_df$city2, a_df$city + ) + a_df$city2 <- ifelse(a_df$country == "australia", + NA, a_df$city2 + ) + + + a_df$postal_code <- ifelse(a_df$country == "australia", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code + ) + a_df$state <- ifelse(a_df$country == "australia", + gsub("[0-9]", "", a_df$state), + a_df$state + ) + + a_df[] <- lapply(a_df, trimws) + + + + # CANADA clean-up --------------------------------------------------------- + + a_df$state <- ifelse(a_df$country == "canada" & nchar(a_df$city) == 2, + a_df$city, a_df$state + ) + + a_df$city <- ifelse(a_df$country == "canada" & nchar(a_df$city) == 2, + NA, a_df$city + ) + + a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2), + a_df$city2, a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2), + a_df$city2, a_df$postal_code + ) + + a_df$state <- ifelse(a_df$country == "canada" & a_df$city2 == a_df$state, + NA, a_df$state + ) + + a_df$city <- ifelse(a_df$country == "canada", a_df$city2, a_df$city) + a_df$city2 <- ifelse(a_df$country == "canada", NA, a_df$city2) + + + a_df$city <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city), + gsub("\\b(\\w{3})\\b \\b(\\w{3})\\b", "", a_df$city), + a_df$city + ) + + a_df$state <- ifelse(a_df$country == "canada" & is.na(a_df$state), + a_df$postal_code, + a_df$state + ) + + a_df$state <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$state), + gsub("\\b(\\w{3})\\b \\b(\\w{3})\\b", "", a_df$state), + a_df$state + ) + + a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{2,20})\\b \\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$postal_code), + gsub("\\b(\\w{1,2}|\\w{4,})\\b", "", a_df$postal_code), + a_df$postal_code + ) + + a_df[] <- lapply(a_df, trimws) + + # TODO: a few postal codes still have letters from city + + + a_df$postal_code <- ifelse(a_df$country == "canada", gsub(" ", "", a_df$postal_code), a_df$postal_code) + + + # UK clean-up ------------------------------------------------------------- + + uk <- c("scotland", "england", "wales", "northern ireland") + pattern <- "[a-z0-9]{2,4} [a-z0-9]{3,4}" + # + # a_df$postal_code <- ifelse(a_df$country %in% uk & + # grepl(pattern, a_df$city2),a_df$city2, + # a_df$postal_code) + + a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city2), + a_df$city2, a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$state), + a_df$state, a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city), + a_df$city, a_df$postal_code + ) + + a_df$postal_code <- ifelse(a_df$country %in% uk, + ifelse(!grepl("\\d", a_df$postal_code), NA, a_df$postal_code), + a_df$postal_code + ) + + a_df$city <- ifelse(a_df$country %in% uk & a_df$city == a_df$postal_code, + NA, a_df$city + ) + + a_df$state <- ifelse(a_df$country %in% uk & a_df$state == a_df$postal_code, + NA, a_df$state + ) + + + a_df$state <- ifelse(a_df$country == "england", a_df$city, a_df$state) + a_df$city <- ifelse(a_df$country == "england", NA, a_df$city) + a_df$city <- ifelse(a_df$country == "england", a_df$postal_code, a_df$city) + a_df$city <- ifelse(a_df$country == "england", + gsub("\\b\\w*\\d\\w*\\b", "", a_df$city), + a_df$city + ) + + # TODO: england still needs work + + a_df$state <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales", NA, a_df$state) + a_df$state <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales" & is.na(a_df$state), a_df$city, a_df$state) + a_df$city <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales", a_df$postal_code, a_df$city) + a_df$city <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales" & is.na(a_df$city), a_df$city2, a_df$city) + a_df$city <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales", + gsub("\\b\\w*\\d\\w*\\b", "", a_df$city), + a_df$city + ) + + + # postal codes clean uk --------------------------------------------------- + + + # Define the function + keep_numerical_parts <- function(df, control_col, country, target_col) { + # Apply the function to each row using sapply or a loop + df[[target_col]] <- sapply(1:nrow(df), function(i) { + if (df[[control_col]][i] == country) { + # Use gregexpr to find all parts of the string that include a numeral + matches <- gregexpr("\\b\\S*\\d\\S*\\b", df[[target_col]][i]) + # Extract the matched parts + result <- regmatches(df[[target_col]][i], matches) + # Combine the matched parts into a single string + result <- unlist(result) + result <- paste(result, collapse = " ") + result <- gsub(" ", "", result) + return(result) + } else { + return(df[[target_col]][i]) + } + }) + + return(df) + } + + + a_df <- keep_numerical_parts(a_df, "country", "scotland", "postal_code") + a_df <- keep_numerical_parts(a_df, "country", "england", "postal_code") + a_df <- keep_numerical_parts(a_df, "country", "northern ireland", "postal_code") + a_df <- keep_numerical_parts(a_df, "country", "wales", "postal_code") + + + + + + + + # INDIA clean-up ---------------------------------------------------------- + + + a_df$postal_code <- ifelse(a_df$country == "india" & grepl("[0-9]{5,10}", a_df$city2), + a_df$city2, a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country == "india" & grepl("[0-9]{5,10}", a_df$city), + a_df$city, a_df$postal_code + ) + + + a_df$city2 <- ifelse(a_df$country == "india" & a_df$state == a_df$city2, + a_df$state, a_df$city2 + ) + a_df$state <- ifelse(a_df$country == "india", NA, a_df$state) + a_df$state <- ifelse(a_df$country == "india" & is.na(a_df$postal_code), + a_df$city, a_df$state + ) + a_df$city <- ifelse(a_df$country == "india" & a_df$state == a_df$city, + NA, a_df$city + ) + a_df$city <- ifelse(a_df$country == "india" & grepl("[0-9]{4,10}", a_df$postal_code), + a_df$postal_code, a_df$city + ) + a_df$city <- ifelse(a_df$country == "india" & is.na(a_df$city), + a_df$city2, a_df$city + ) + + + a_df$postal_code <- ifelse(a_df$country == "india", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code + ) + a_df$city <- ifelse(a_df$country == "india", + gsub("[0-9]", "", a_df$city), + a_df$city + ) + + a_df$city <- ifelse(a_df$country == "india" & (grepl("delhi", a_df$city) | grepl("delhi", a_df$state)), + "new delhi", a_df$city + ) + + + # CHINA clean-up ---------------------------------------------------------- + + + + + + a_df$postal_code <- ifelse(a_df$country == "china" & grepl("[0-9]{5,10}", a_df$city2), + a_df$city2, a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country == "china" & grepl("[0-9]{5,10}", a_df$city), + a_df$city, a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country == "china" & grepl("[0-9]{5,10}", a_df$state), + a_df$state, a_df$postal_code + ) + + + a_df$city2 <- ifelse(a_df$country == "china" & a_df$state == a_df$city2, + a_df$state, a_df$city2 + ) + a_df$state <- ifelse(a_df$country == "china", NA, a_df$state) + a_df$state <- ifelse(a_df$country == "china" & is.na(a_df$postal_code), + a_df$city, a_df$state + ) + a_df$city <- ifelse(a_df$country == "china" & a_df$state == a_df$city, + NA, a_df$city + ) + a_df$city <- ifelse(a_df$country == "china" & grepl("[0-9]{4,10}", a_df$postal_code), + a_df$postal_code, a_df$city + ) + a_df$city <- ifelse(a_df$country == "china" & is.na(a_df$city), + a_df$city2, a_df$city + ) + + + a_df$postal_code <- ifelse(a_df$country == "china", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code + ) + a_df$city <- ifelse(a_df$country == "china", + gsub("[0-9]", "", a_df$city), + a_df$city + ) + + + + a_df$city <- ifelse(a_df$country == "china" & grepl("beijing", a_df$state), + "beijing", a_df$city + ) + + + # Define words indicating this is actually a dept not state or postal code + # will use this list to delete the ones that don't apply + to_delete <- c( + "&", "inst", "ctr", "med", "chem", "lab", "biol", + "dept", "div", "univ", "hosp", "coll", "sci", "rd", + "program", "minist", "educ", "sch ", "grad ", "fac ", + "assoc", "forest" + ) + + + pattern <- paste(to_delete, collapse = "|") + # Apply the ifelse function to update + # a_df$city <- ifelse(a_df$country == "china" & + # grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), + # NA, a_df$city) + # + # + # a_df$city<- ifelse(a_df$country=="china" & is.na(a_df$city), + # a_df$state, a_df$city) + # + + + a_df[] <- lapply(a_df, trimws) + # Clean chn_pc1 and chn_pc2 + + + + # TODO: check this, allows verifying iof what is in state is actually the city + + # chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", + # "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", + # "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", + # "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", + # "gansu", "inner mongolia", "jilin", "hainan", "ningxia", + # "qinghai", "tibet", "macao") + # pattern <- paste(to_delete, collapse = "|") + # a_df$state<- ifelse(a_df$country=="china" & + # grepl(pattern, a_df$state, ignore.case = TRUE, perl = TRUE), + # NA, a_df$state) + # TODO: fix. not necessary but useful. + # All the cities in the addresses, add as needed. + # chn_cities <- unique(c((a_df$country=="china" & a_df$city), "lhasa")) + + + + # pc is letters dash numbers ---------------------------------------------- + + + pattern <- "\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" + + a_df$postal_code <- ifelse(grepl(pattern, a_df$city), + a_df$city, a_df$postal_code + ) + + a_df$postal_code <- ifelse(grepl(pattern, a_df$state), + a_df$state, a_df$postal_code + ) + + + # a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city==a_df$postal_code), + # a_df$postal_code,a_df$state) + + a_df$state <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$postal_code), + a_df$city, a_df$state + ) + + + a_df$city <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$postal_code), + a_df$postal_code, a_df$city + ) + + a_df$city2 <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$city), + NA, a_df$city2 + ) + + # # + # # a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$state), + # # NA,a_df$state) + # # a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city==a_df$state), + # # NA,a_df$state) + # # + # a_df$city<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$city), + # a_df$city2,a_df$city) + + + a_df$city <- ifelse(grepl(pattern, a_df$city), + gsub("[0-9]", "", a_df$city), + a_df$city + ) + a_df$city <- gsub("[a-z]{1,2}- ", "", a_df$city) + + + # a_df$postal_code<- gsub("[a-z]","", a_df$postal_code) + + a_df$city <- gsub("[-]", "", a_df$city) + a_df[] <- lapply(a_df, trimws) + + + pattern <- "\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" + + a_df$postal_code <- ifelse(grepl(pattern, a_df$postal_code), + gsub("[a-z]", "", a_df$postal_code), + a_df$postal_code + ) + + a_df$postal_code <- gsub("[-]", "", a_df$postal_code) + a_df[] <- lapply(a_df, trimws) + + + + + + # final check of postal codes (consecutive nos.) -------------------------- + + + # Define the function + extract_consecutive_numbers <- function(df, source, destination) { + df[[destination]] <- sapply(1:nrow(df), function(i) { + # Use gregexpr to find sequences of 4 or more consecutive numbers + if (is.na(df[[destination]][i])) { + matches <- gregexpr("\\d{4,}", df[[source]][i]) + # Extract the matched sequences + result <- regmatches(df[[source]][i], matches) + # Flatten the list of matches into a character vector + result <- unlist(result) + # Combine the matched sequences into a single string + result <- paste(result, collapse = " ") + return(result) + } else { + return(df[[destination]][i]) + } + }) + return(df) + } + + a_df <- extract_consecutive_numbers(a_df, "state", "postal_code") + + + + # clean the city ---------------------------------------------------------- + + # remove any digits + + a_df$city <- gsub("[0-9]", "", a_df$city) + + + + # clean up postal code ---------------------------------------------------- + + + a_df$postal_code <- ifelse(grepl("\\b[a-zA-Z]+\\s+[0-9]+\\b", a_df$postal_code), + gsub("\\b[a-zA-Z]+\\s", "", a_df$postal_code), + a_df$postal_code + ) + + + + + # NETHERLANDS clean-up ---------------------------------------------------- + + + # cities often have two characters at start (ascii version of ligature/dipthong) + + a_df[] <- lapply(a_df, trimws) + a_df$city <- ifelse(a_df$country == "netherlands" & grepl("^[a-zA-Z]{2} ", a_df$city), + (sub("^[a-zA-Z]{2} ", "", a_df$city)), a_df$city + ) + + + + a_df[] <- lapply(a_df, trimws) + + + + # Final clean-up of some US cities and states ----------------------------- + + + # Remove panama canal zone from usa states (for stri) + a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "cz"), + NA, a_df$state + ) + + # armed forces & diplomatic + a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "aa"), + NA, a_df$state + ) + + a_df$city <- ifelse((a_df$country == "usa" & a_df$state == "apo"), + NA, a_df$city + ) + + a_df$city <- ifelse((a_df$country == "usa" & a_df$state == "dpo"), + NA, a_df$city + ) + + a_df$city <- ifelse(a_df$city == "university pk", + "university park", + a_df$city + ) + + a_df$city <- ifelse(a_df$city == "college stn", + "college station", + a_df$city + ) + + a_df$city <- ifelse(a_df$city == "n chicago", + "north chicago", + a_df$city + ) + + a_df$city <- ifelse(a_df$city == "college pk", + "college park", + a_df$city + ) + + a_df$city <- ifelse(a_df$city == "research triangle pk" | a_df$city == "res triangle pk", + "research triangle park", + a_df$city + ) + + a_df$city <- ifelse(a_df$city == "state coll", + "state college", + a_df$city + ) + + + + a_df$city <- ifelse(grepl("sioux ctr", a_df$city), + (sub("sioux ctr", "sioux city", a_df$city)), a_df$city + ) + + + + a_df$city <- ifelse(grepl("sioux ctr", a_df$city), + (sub("sioux ctr", "sioux city", a_df$city)), a_df$city + ) + + + # Final clean-up of some Brazil cities and states ------------------------- + + + + a_df$city <- ifelse(a_df$city == "gavea rio de janeiro", + "rio de janeiro", + a_df$city + ) + + + a_df$city <- ifelse((a_df$country == "brazil" & a_df$city == "s jose campos"), + "sao jose dos campos", + a_df$city + ) + + a_df$city <- ifelse((a_df$country == "brazil" & (a_df$city == "rio de janerio" | + a_df$city == "rio de janiero" | + a_df$city == "rio der janeiro" | + a_df$city == "rio janeiro" | + a_df$city == "rio janiero")), + "rio de janeiro", + a_df$city + ) + + # Final clean-up of some INDIA cities and states -------------------------- + + + + a_df$city <- ifelse((a_df$city == "dehra dun" & a_df$country == "india"), + "dehradun", + a_df$city + ) + + + # Final clean-up of some CANADA cities and states ------------------------- + + + a_df$city <- ifelse((a_df$city == "st john" & a_df$country == "canada"), + "st. john's", + a_df$city + ) + + + # Final clean-up of some UK cities and states ----------------------------- + + + a_df$state <- ifelse(a_df$state == "london ", + "london", + a_df$state + ) + + a_df$city <- ifelse((a_df$state == "london" & a_df$country == "england"), + "london", + a_df$city + ) + + + # final clean-up of some MEXICO cities and states ------------------------- + + + a_df$city <- ifelse(a_df$country == "mexico" & a_df$city == "df", + "mexico city", + a_df$city + ) + + + # final clean-up of some ARGENTINA cities and states ---------------------- + + + + a_df$city <- ifelse(a_df$country == "argentina" & a_df$city == "df", + "buenos aires", a_df$city + ) + + + # final clean up of some ABBREVIATIONS in city names ---------------------- + + + a_df$city <- ifelse(grepl("^st ", a_df$city), + (sub("^st ", "saint ", a_df$city)), a_df$city + ) + + a_df$city <- ifelse(grepl(" st ", a_df$city), + (sub(" st ", " saint ", a_df$city)), a_df$city + ) + + a_df$city <- ifelse(grepl("^ste ", a_df$city), + (sub("^ste ", "saint ", a_df$city)), a_df$city + ) + + + + # removing departments etc allocated to city or state --------------------- + + # use strings of words typical of institutions or departmewnts to remove + + tech_words <- c( + " lab ", "lab ", " lab", "dept", "hosp", " inst", "inst ", "ctr", + "unit", "ltd", "minist", "educ", "grad ", " sch ", "sch ", " sch", + "coll ", " sci ", "natl", "&", " med", "med ", + "publ", "dept", "biomed", "phys", "technol", + "engn" + ) + pattern <- paste(tech_words, collapse = "|") + + a_df$city <- ifelse((a_df$city != "esch sur alzette" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), + a_df$state, a_df$city + ) + + + a_df$state <- ifelse(a_df$state == a_df$city2, NA, a_df$state) + + a_df$state <- ifelse(grepl("[[:digit:]]", a_df$state), + NA, a_df$state + ) + + a_df$state <- ifelse(a_df$state == "", NA, a_df$state) + + a_df$postal_code <- ifelse(a_df$postal_code == "", NA, a_df$postal_code) + + + + # still some us states not extracting properly but fixed here ------------- + + + + us_state_abbreviations_lower <- c( + "al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", + "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", + "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", + "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", + "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy" + ) + pattern <- paste(us_state_abbreviations_lower, collapse = "|") + a_df$country_list <- country_list + a_df$state <- ifelse((a_df$country == "usa" & is.na(a_df$state) & grepl(pattern, a_df$address, ignore.case = TRUE, perl = TRUE)), + a_df$country_list, a_df$state + ) + + + a_df$state <- ifelse((a_df$country == "usa" & grepl("[[:digit:]]", a_df$state)), + gsub("[[:digit:]]", "", a_df$state), a_df$state + ) + a_df$state <- ifelse((a_df$country == "usa" & grepl("usa", a_df$state)), + gsub("usa", "", a_df$state), a_df$state + ) + a_df$state <- trimws(a_df$state, which = "both") + + + + + # Japanese prefectures & cities sometimes swapped in address -------------- + + + + to_delete <- c( + "&", "inst", "ctr", "med", "chem", "lab", "biol", + "dept", "div", "univ", "hosp", "coll", "sci", "rd", + "program", "minist", "educ", "sch ", "grad ", "fac ", + "assoc", "forest", "corp" + ) + pattern <- paste(to_delete, collapse = "|") + a_df$city2 <- ifelse((a_df$country == "japan" & grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), + NA, a_df$city2 + ) + + # Remove any with numbers + a_df$city2 <- ifelse((a_df$country == "japan" & grepl("[[:digit:]]", a_df$city2)), + NA, a_df$city2 + ) + + japan_prefectures <- c( + "hokkaido", "aomori", "iwate", "miyagi", "akita", + "yamagata", "fukushima", "ibaraki", "tochigi", "gunma", + "saitama", "chiba", "tokyo", "kanagawa", "niigata", + "toyama", "ishikawa", "fukui", "yamanashi", "nagano", "gifu", + "shizuoka", "aichi", "mie", "shiga", "kyoto", "osaka", "gumma", + "hyogo", "nara", "wakayama", "tottori", "shimane", + "okayama", "hiroshima", "yamaguchi", "tokushima", "kagawa", + "ehime", "kochi", "fukuoka", "saga", "nagasaki", "kumamoto", + "oita", "miyazaki", "kagoshima", "okinawa" + ) + pattern <- paste(japan_prefectures, collapse = "|") + + + a_df$state <- ifelse((a_df$country == "japan" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), + a_df$city, a_df$state + ) + + + # This removes all special regions of a city like tokyo from city2 + a_df$city2 <- ifelse((a_df$country == "japan" & grepl(" ku", a_df$city2, ignore.case = TRUE, perl = TRUE)), + NA, a_df$city2 + ) + + # replace from city with city2 EXCEPT in cases where no state (and therefore + # city is correct) and where no city2 (otherwise would bring in NA) + + a_df$city <- ifelse((a_df$country == "japan" & !(is.na(a_df$state)) & !(is.na(a_df$city2))), + a_df$city2, a_df$city + ) + + + + # fine-tuning SCOTLAND ---------------------------------------------------- + + + + a_df$city <- ifelse((a_df$country == "scotland" & grepl("univ ", a_df$city, ignore.case = TRUE, perl = TRUE)), + gsub("univ ", "", a_df$city), a_df$city + ) + + to_delete <- c( + " ave", " grp", "hlth", " rd", "mrc", " oba", "plz", + " dr", "oqb", " quad", "fisheries" + ) + + pattern <- paste(to_delete, collapse = "|") + a_df$city <- ifelse((a_df$country == "scotland" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), + NA, a_df$city + ) + + + + # fine-tuning ENGLAND ----------------------------------------------------- + + + to_delete <- c( + "&", "inst", "ctr", "med", "chem", "lab", "biol", + "dept", "div", "univ", "hosp", "coll", "sci", "rd", + "program", "minist", "educ", "sch ", "grad ", "fac ", + " sq", "quarter", " way", " dr", "diagnost", "consultant", + "microsoft", "diagnost", "[[:digit:]]", "project", "facil", "grp", + "campus", "expt", " pk", "canc", "assoc", "forest", "corp", + "consortium", "partners", "lane", "ucl", "street", "trust", + "business", "inform", "royal", "survey", "drosophila", " st", + "ndorms", "nat hist", "hlth", " ave", "council", "unit", "nerc", "nat res" + ) + pattern <- paste(to_delete, collapse = "|") + a_df$city2 <- ifelse((a_df$country == "england" & grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), + NA, a_df$city2 + ) + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)), + a_df$city2, a_df$city + ) + + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & + grepl("london", a_df$address, ignore.case = TRUE, perl = TRUE), + "london", a_df$city + ) + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & + grepl("cambridge", a_df$address, ignore.case = TRUE, perl = TRUE), + "cambridge", a_df$city + ) + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & + grepl("oxford", a_df$address, ignore.case = TRUE, perl = TRUE), + "oxford", a_df$city + ) + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & + grepl("durham", a_df$address, ignore.case = TRUE, perl = TRUE), + "durham", a_df$city + ) + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & + grepl("bristol", a_df$address, ignore.case = TRUE, perl = TRUE), + "bristol", a_df$city + ) + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & + grepl("lancaster", a_df$address, ignore.case = TRUE, perl = TRUE), + "lancaster", a_df$city + ) + + + + # delete columns used to 2x check ---------------------------------------- + + a_df$city2 <- NULL + a_df$country_list <- NULL + + + # return output of function ----------------------------------------------- + + + return(a_df) +} + + + + + + + + + + +# country<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country) %>% +# mutate(summary=nchar(country)) %>% +# arrange(country) +# +# +# country_city<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country,city) %>% +# mutate(city_char=nchar(city)) %>% +# arrange(country,city) +# +# +# +# country_state<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country,state) %>% +# mutate(city_char=nchar(state)) %>% +# arrange(country,state) +# +# country_state_city<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country ,state,city) %>% +# mutate(city_char=nchar(city)) %>% +# arrange(country,state,city) +# +# +# country_state_city_pc<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country ,state,postal_code,city) %>% +# mutate(city_char=nchar(city)) %>% +# arrange(country,state,postal_code,city) + + + + + +# old code ---------------------------------------------------------------- + +# +# +# +# +# last_element %in% c("india", "china", +# +# +# a_df$state<- ifelse(a_df$state==a_df$city2,NA,a_df$state) +# +# +# +# ## Australia postal codes also separated +# # sometimes in the next-to-last, sometimes in the next-to-next-to-last (with state) +# +# # First need to "fix" the city +# # Function to check for three characters or numbers in the city-list and replace with NA +# process_aus_address <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) # Not Australia +# } +# +# if (x[[length(x)]] == "australia") { +# if (length(x) == 3) { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) +# } else { +# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) +# } +# } +# +# return(c(NA, NA)) # Not Australia +# } +# +# # Apply the function across all addresses using `mapply` +# results <- t(mapply(process_aus_address, list_address)) +# colnames(results) <- c("aus_city", "aus_state") +# +# # Clean up results +# results <- as.data.frame(results, stringsAsFactors = FALSE) +# results$aus_city[results$aus_city == "not australia"] <- NA +# results$aus_state[results$aus_state == "not australia"] <- NA +# +# # take the PC+state and assign to PC +# results$aus_pc<-results$aus_state +# # remove all digits from state +# results$aus_state <- trimws(gsub("\\d", "", results$aus_state)) +# # remove all letters from pc +# results$aus_pc <- trimws(gsub("[a-z]", "", results$aus_pc)) +# results$aus_pc[results$aus_pc == ""] <- NA +# +# # if na in PC, assign the city (some of which have PC) +# results$aus_pc <- ifelse(is.na(results$aus_pc), results$aus_city, results$aus_pc) +# # remove all metters from pc, leaving any new pc +# results$aus_pc <- trimws(gsub("[a-z]", "", results$aus_pc)) +# results$aus_pc[results$aus_pc == ""] <- NA +# # now remove any PC from city +# results$aus_city <- trimws(gsub("\\d", "", results$aus_city)) #alternative +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$aus_city, city_list) +# +# # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$aus_state, state_list) +# +# # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$aus_pc, pc_list) +# +# rm(results) +# +# +# +# # CANADA ------------------------------------------------------------------ +# +# ## Canada postal codes also separated +# # sometimes in the next-to-last, sometimes in the next-to-next-to-last (with state) +# +# process_can_address <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) # Not Canada +# } +# +# if (x[[length(x)]] == "canada") { +# if (length(x) == 3) { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) +# } else { +# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) +# } +# } +# +# return(c(NA, NA)) # Not canada +# } +# +# # Apply the function across all addresses using `mapply` +# results <- t(mapply(process_can_address, list_address)) +# colnames(results) <- c("can_city", "can_state") +# +# # Clean up results +# results <- as.data.frame(results, stringsAsFactors = FALSE) +# results$can_city[results$can_city == "not canada"] <- NA +# results$can_state[results$can_state == "not canada"] <- NA +# +# # take the PC+state and assign to PC +# results$can_pc<-results$can_state +# +# # any without numbers gets NA'd +# results$can_pc[!grepl("\\d", results$can_pc)] <- NA +# # removes state and removes ltr at start of PC +# results$can_pc<-sub("[a-z]\\s+[a-z]", "", results$can_pc) #+[a-z] +# +# +# # if na in PC, assign the city (some of which have PC) +# results$can_pc <- ifelse(is.na(results$can_pc), results$can_city, results$can_pc) +# results$can_pc[!grepl("\\d", results$can_pc)] <- NA +# +# # keep portions with numbers / remove city names +# # .*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$: This regex pattern matches any +# # characters (.*) followed by a word boundary (\\b) and exactly three word +# # characters (\\w{3}), capturing this as the first group ((\\w{3})). It +# # then matches any characters again (.*) followed by another word boundary +# # and exactly three word characters, capturing this as the second +# # group ((\\w{3})), and ensures this is at the end of the string ($). +# # sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", string): Replaces the +# # entire string with the two captured groups separated by a space. +# results$can_pc<-sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", results$can_pc) +# +# # results$can_pc<-gsub("\\b[^\\d\\s]+\\b", "", results$can_pc) #+[a-z] removes 2nd ltr +# +# results$can_pc[results$can_pc == ""] <- NA +# # now remove any PC from city +# results$can_city<-sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", results$can_city) # all after first space +# +# +# # fix state +# results$can_state <- trimws(gsub("\\d", "", results$can_state)) +# results$can_state <-trimws(sub(" .*", "", results$can_state)) # all after first space +# results$can_state <- gsub("british", "bc", results$can_state) +# results$can_state <- gsub("nova", "ns", results$can_state) +# # fix city +# results$can_city <- trimws(gsub("\\d", "", results$can_city)) +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$can_city, city_list) +# +# # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$can_state, state_list) +# +# # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$can_pc, pc_list) +# +# rm(results) +# +# # INDIA ------------------------------------------------------------------ +# +# # India states are almost always listed but New Delhi is complicated, +# # as are any with only three entries +# # Function to process addresses for both city and state +# process_india_address <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) # Not India +# } +# +# if (x[[length(x)]] == "india") { +# if (length(x) == 3) { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) +# } else { +# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) +# } +# } +# +# if (length(x) != 3 && grepl("delhi", x[[length(x) - 1]])) { +# return(c(trimws(x[[length(x) - 1]]), NA)) +# } +# +# return(c(NA, NA)) # Not India +# } +# +# # Apply the function across all addresses using `mapply` +# results <- t(mapply(process_india_address, list_address)) +# colnames(results) <- c("india_city", "india_state") +# +# # Clean up results +# results <- as.data.frame(results, stringsAsFactors = FALSE) +# results$india_city[results$india_city == "not india"] <- NA +# results$india_state[results$india_state == "not india"] <- NA +# +# # Remove numeric parts from state names and trim whitespace +# results$india_state <- trimws(gsub("\\s([0-9]+)", "", results$india_state)) +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$india_city, city_list) +# #### +# +# state_list<-ifelse(is.na(state_list),results$india_state, state_list) +# +# rm(results) +# +# +# # BRAZIL ------------------------------------------------------------------ +# +# +# # Function to process addresses for both city and state +# process_brazil_address <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) # Not brl +# } +# if (x[[length(x)]] == "brazil") { +# if (length(x) == 3) { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) +# } else { +# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) +# } +# } +# +# +# return(c(NA, NA)) # Not brl +# } +# +# +# +# # Apply the function across all addresses +# results <- as.data.frame(t(mapply(process_brazil_address, list_address)), +# stringsAsFactors = FALSE) +# colnames(results) <- c("brl_city", "brl_state") +# +# +# +# +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# to_check <- c("dept","ctr","inst","ppg","andar","empresas", +# "programa", "ciencias", "unidade", "lab ") +# +# results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), +# results$brl_state, +# results$brl_city) +# +# +# +# results$brl_state <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_state), +# results$brl_city, +# results$brl_state) +# +# +# # Define the function +# extract_brl_postcodes <- function(df, source_col, target_col,brl_new_city) { +# # 6 digits +# +# pattern <- "br-[0-9]{5,8}" +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# df[i, brl_new_city] <- df[i, source_col] +# # df[i, city_col] <- df[i, source_col] +# +# } +# } +# return(df) +# } +# +# results$brl_pc<-NA +# results$brl_new_city<-NA +# # df, source_col, target_col,city_col +# results <- extract_brl_postcodes(results, "brl_city","brl_pc", "brl_new_city") +# results <- extract_brl_postcodes(results, "brl_state","brl_pc", "brl_new_city") +# +# +# results$brl_new_city <- ifelse(is.na(results$brl_new_city), +# results$brl_state, +# results$brl_new_city) +# +# +# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, +# results$brl_city, +# results$brl_state) +# +# +# +# +# results$brl_new_city <- ifelse(is.na(results$brl_new_city), +# results$brl_city, +# results$brl_new_city) +# +# +# +# +# results$brl_city<-gsub("br-","",results$brl_city) +# results$brl_city<-sub("[0-9]{2,8}","",results$brl_city) +# results$brl_city<-sub("-","",results$brl_city) +# +# +# results$brl_new_city<-gsub("br-","",results$brl_new_city) +# results$brl_new_city<-sub("[0-9]{2,8}","",results$brl_new_city) +# results$brl_new_city<-sub("-","",results$brl_new_city) +# +# +# results$brl_pc<-gsub("br-","",results$brl_pc) +# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) +# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) +# +# results$brl_state<-gsub("br-","",results$brl_state) +# results$brl_state<-sub("[0-9]{2,8} ","",results$brl_state) +# results$brl_state<-sub(" [0-9]{2,8} ","",results$brl_state) +# +# # +# # +# # +# # +# # +# # results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) +# # results$brl_new_city<-gsub("gavea rio de janeiro","rio de janeiro",results$brl_new_city) +# # results$brl_new_city<-gsub("gavea","rio de janeiro",results$brl_new_city) +# # # +# # results$brl_city<-gsub("-","",results$brl_city) +# # +# # results$brl_state<-gsub("br-","",results$brl_state) +# # results$brl_state<-gsub("-","",results$brl_state) +# +# +# +# +# +# # any without numbers gets NA'd +# results$brl_pc[!grepl("\\d", results$brl_pc)] <- NA +# +# # keep portions with numbers / remove city names +# +# results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) +# results$brl_city<-sub(" [0-9]{2,8}","",results$brl_city) +# # +# # # Specific replacements +# # results$brl_city <- ifelse(grepl("rio de janeiro", results$brl_city), +# # "rio de janeiro", results$brl_city) +# +# results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), +# "rio de janeiro", results$brl_city) +# results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), +# "rj", results$brl_state) +# results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), +# "sp", results$brl_state) +# +# +# +# results$brl_city[results$brl_city==results$brl_state]<-NA +# +# +# # Clean up and adjust columns +# results[] <- lapply(results, trimws) +# +# +# # Define city-to-state mapping +# city_state_mapping <- data.frame( +# city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), +# state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), +# stringsAsFactors = FALSE +# ) +# +# # Match cities and states +# for (i in 1:nrow(city_state_mapping)) { +# results$brl_city <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), +# city_state_mapping$city[i], results$brl_city) +# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), +# city_state_mapping$state[i], results$brl_state) +# } +# +# +# # Match cities and states +# for (i in 1:nrow(city_state_mapping)) { +# +# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), +# city_state_mapping$state[i], results$brl_state) +# } +# +# +# +# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, +# results$brl_city, +# results$brl_state) +# +# +# results$brl_city <- trimws(results$brl_city, which = "both") +# results$brl_state <- trimws(results$brl_state, which = "both") +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# to_check <- c("dept","ctr","inst","ppg", +# "programa", "ciencias", "unidade", "lab ") +# +# results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), +# results$brl_state, +# results$brl_city) +# +# +# # Final trimming +# results[] <- lapply(results, trimws) +# +# results$brl_city <- ifelse(results$brl_new_city==results$brl_city, +# NA, +# results$brl_city) +# +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$brl_city, city_list) +# # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$brl_state, state_list) +# # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$brl_pc, pc_list) +# +# +# rm(results,city_state_mapping) +# +# +# +# # Handle postal codes (BR-[0-9]) +# results$brazil_pc <- ifelse(grepl("br-[0-9]", results$brazil_state), results$brazil_state, NA) +# results$brazil_pc <- ifelse(grepl("br-[0-9]", results$brazil_city) & is.na(results$brazil_pc), +# results$brazil_city, results$brazil_pc) +# +# # Remove BR codes from city and state +# results$brazil_city <- gsub("br-[0-9]+", "", results$brazil_city) +# results$brazil_state <- gsub("br-[0-9]+", "", results$brazil_state) +# +# # Define city-to-state mapping +# city_state_mapping <- data.frame( +# city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "sao paulo"), +# state = c("sp", "sp", "sp", "sp", "rj", "rj", "sp"), +# stringsAsFactors = FALSE +# ) +# +# # Match cities and states +# for (i in 1:nrow(city_state_mapping)) { +# results$brazil_city <- ifelse(grepl(city_state_mapping$city[i], results$brazil_state, ignore.case = TRUE), +# city_state_mapping$city[i], results$brazil_city) +# results$brazil_state <- ifelse(grepl(city_state_mapping$city[i], results$brazil_state, ignore.case = TRUE), +# city_state_mapping$state[i], results$brazil_state) +# } +# +# # Specific replacements +# results$brazil_city <- ifelse(grepl("museu nacl", results$brazil_city), +# "rio de janeiro", results$brazil_city) +# results$brazil_state <- ifelse(grepl("rio de janeiro", results$brazil_state, ignore.case = TRUE), +# "rj", results$brazil_state) +# results$brazil_state <- ifelse(grepl("sao paulo", results$brazil_state, ignore.case = TRUE), +# "sp", results$brazil_state) +# +# # cleanup +# results$brazil_city[results$brazil_city==results$brazil_state]<-NA +# results$brazil_city <- trimws(results$brazil_city, which = "both") +# results$brazil_state <- trimws(results$brazil_state, which = "both") +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# to_check <- c("dept","ctr","inst","ppg", +# "programa", "ciencias", "unidade", "lab ") +# +# results$brazil_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brazil_city), +# results$brazil_state, +# results$brazil_city) +# +# # Clean postal codes +# results$brazil_pc <- gsub("[A-Za-z-]", "", results$brazil_pc) +# +# # Final trimming +# results[] <- lapply(results, trimws) +# +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$brazil_city, city_list) +# # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$brazil_state, state_list) +# # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$brazil_pc, pc_list) +# +# +# rm(results,city_state_mapping) +# +# +# # CHINA ------------------------------------------------------------------- +# chn_extract <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) +# } else if (x[[length(x)]] == "china") { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 2]]))) +# } else { +# return(c(NA, NA)) +# } +# } +# +# chn_pc <- data.frame(do.call(rbind, lapply(list_address, chn_extract))) +# names(chn_pc) <- c("chn_city", "chn_state") +# +# # Define the function +# extract_china_postcodes <- function(df, source_col, target_col,chn_new_city) { +# # 6 digits +# pattern <- "[0-9]{6}" +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# df[i, chn_new_city] <- df[i, source_col] +# # df[i, city_col] <- df[i, source_col] +# +# } +# } +# return(df) +# } +# +# chn_pc$chn_pc<-NA +# chn_pc$chn_new_city<-NA +# # df, source_col, target_col,city_col +# chn_pc <- extract_china_postcodes(chn_pc, "chn_city","chn_pc", "chn_new_city") +# chn_pc <- extract_china_postcodes(chn_pc, "chn_state","chn_pc", "chn_new_city") +# +# +# +# # any without numbers gets NA'd +# chn_pc$chn_pc[!grepl("\\d", chn_pc$chn_pc)] <- NA +# +# # keep portions with numbers / remove city names +# chn_pc$chn_new_city<-sub("[0-9]{6}","",chn_pc$chn_new_city) +# chn_pc$chn_city<-sub("[0-9]{6}","",chn_pc$chn_city) +# chn_pc$chn_state<-sub("[0-9]{6}","",chn_pc$chn_state) +# +# +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# +# to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", +# "dept", "div", "univ", "hosp", "coll", "sci", "rd", +# "program","minist", "educ", "sch ", "grad ", "fac ", +# "assoc","forest") +# +# # Define the function +# +# +# # Print the resulting dataframe +# print(a_df) +# +# +# chn_pc[, c("chn_city","chn_state", "chn_pc", "chn_new_city")]<-lapply(chn_pc[, c("chn_city","chn_state", "chn_pc", "chn_new_city")], trimws) +# +# +# clean_column <- function(column, delete_terms) { +# column <- gsub(paste(delete_terms, collapse = "|"), NA, column) +# column <- gsub("[0-9]", "", column) # Remove digits +# trimws(column) # Remove leading/trailing whitespace +# } +# +# +# # Clean chn_pc1 and chn_pc2 +# chn_pc$chn_city <- clean_column(chn_pc$chn_city, to_delete) +# chn_pc$chn_new_city <- clean_column(chn_pc$chn_new_city, to_delete) +# chn_pc$chn_state <- clean_column(chn_pc$chn_state, to_delete) +# +# +# chn_pc$chn_new_city <- ifelse(is.na(chn_pc$chn_new_city), +# chn_pc$chn_state, +# chn_pc$chn_new_city) +# +# +# chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_state)==TRUE), +# NA,chn_pc$chn_state) +# +# +# +# chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_city)==FALSE), +# chn_pc$chn_city,chn_pc$chn_state) +# +# +# +# chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_state)==TRUE), +# NA,chn_pc$chn_state) +# +# chn_pc$chn_state <- gsub(" province", "",chn_pc$chn_state) +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), chn_pc$chn_new_city, city_list) +# +# # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),chn_pc$chn_state, state_list) +# +# # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),chn_pc$chn_pc, pc_list) +# +# rm(chn_pc) +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # ### China has some where the postal code is with the city, so fix those here +# # # Extract postal code information from the list +# # chn_extract <- function(x) { +# # if (length(x) == 1) { +# # return(c(NA, NA)) +# # } else if (x[[length(x)]] == "china") { +# # return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 2]]))) +# # } else { +# # return(c(NA, NA)) +# # } +# # } +# +# # Apply extraction to list_address +# # chn_missing_pc <- data.frame(do.call(rbind, lapply(list_address, chn_extract))) +# # names(chn_missing_pc) <- c("chn_pc1", "chn_pc2") +# # +# # # Define words indicating this is actually a dept not state or postal code +# # # will use this list to delete the ones that don't apply +# # to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", +# # "dept", "div", "univ", "hosp", "coll", "sci", "rd","program" +# # "minist", "educ", "sch ", "grad ", "fac ","assoc") +# # +# # +# # +# # # Extract numeric postal codes +# # chn_missing_pc$chn_pc_from1 <- as.numeric(gsub("[A-Za-z]", "", chn_missing_pc$chn_pc1)) +# # chn_missing_pc$chn_pc_from2 <- as.numeric(gsub("[A-Za-z]", "", chn_missing_pc$chn_pc2)) +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # +# # clean_column <- function(column, delete_terms) { +# # column <- gsub(paste(delete_terms, collapse = "|"), NA, column) +# # column <- gsub("[0-9]", "", column) # Remove digits +# # trimws(column) # Remove leading/trailing whitespace +# # } +# # +# # # Clean chn_pc1 and chn_pc2 +# # chn_missing_pc$chn_pc1 <- clean_column(chn_missing_pc$chn_pc1, to_delete) +# # chn_missing_pc$chn_pc2 <- clean_column(chn_missing_pc$chn_pc2, to_delete) +# # +# # # Initialize empty columns for final outputs +# # chn_missing_pc$chn_pc <- NA +# # chn_missing_pc$chn_city <- NA +# # chn_missing_pc$chn_state <- NA +# # +# # # Assign postal codes, cities, and states based on conditions +# # assign_chn_data <- function(from1, from2, pc1, pc2) { +# # list( +# # chn_pc = ifelse(is.na(from1) & !is.na(from2), from2, from1), +# # chn_city = ifelse(is.na(from1) & !is.na(from2), pc2, pc1), +# # chn_state = ifelse(is.na(from1) & !is.na(from2), pc1, pc2) +# # ) +# # } +# # +# # chn_result <- assign_chn_data(chn_missing_pc$chn_pc_from1, +# # chn_missing_pc$chn_pc_from2, +# # chn_missing_pc$chn_pc1, +# # chn_missing_pc$chn_pc2) +# # +# # chn_missing_pc$chn_pc <- chn_result$chn_pc +# # chn_missing_pc$chn_city <- gsub("[0-9]", "", chn_result$chn_city) +# # chn_missing_pc$chn_state <- gsub("[0-9]", "", chn_result$chn_state) +# +# # # Define Chinese states and cities +# # chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", +# # "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", +# # "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", +# # "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", +# # "gansu", "inner mongolia", "jilin", "hainan", "ningxia", +# # "qinghai", "tibet", "macao") +# # +# # # All the cities in the addresses, add as needed. +# # chn_cities <- unique(c(chn_missing_pc$chn_city, "lhasa")) +# +# # Update states and cities based on matching conditions +# # chn_missing_pc$chn_state <- ifelse(is.na(chn_missing_pc$chn_state) & chn_missing_pc$chn_pc1 %in% chn_states, +# # chn_missing_pc$chn_pc1, chn_missing_pc$chn_state) +# # chn_missing_pc$chn_state <- ifelse(is.na(chn_missing_pc$chn_state) & chn_missing_pc$chn_pc2 %in% chn_states, +# # chn_missing_pc$chn_pc2, chn_missing_pc$chn_state) +# # +# # chn_missing_pc$chn_city <- ifelse(is.na(chn_missing_pc$chn_city) & !(chn_missing_pc$chn_pc1 %in% chn_states), +# # chn_missing_pc$chn_pc1, chn_missing_pc$chn_city) +# # chn_missing_pc$chn_city <- ifelse(is.na(chn_missing_pc$chn_city) & !(chn_missing_pc$chn_pc2 %in% chn_states), +# # chn_missing_pc$chn_pc2, chn_missing_pc$chn_city) +# # +# # # put the postal codes and cities in the pc_list, state_list +# # pc_list<-ifelse(is.na(pc_list),chn_missing_pc$chn_pc, pc_list) +# # city_list<-ifelse(is.na(city_list),chn_missing_pc$chn_city, city_list) +# # state_list<-ifelse(is.na(state_list),chn_missing_pc$chn_state, state_list) +# # +# # +# # rm(chn_cities,chn_states,to_delete,chn_result,chn_missing_pc) +# +# +# +# +# +# +# +# +# +# # UK ------------------------------------------------------------------ +# +# process_uk_address <- function(x) { +# if (length(x) == 1) { +# return(c(NA, NA)) # Not uk +# } +# +# if ((x[[length(x)]] == "england")| +# (x[[length(x)]] == "scotland")| +# (x[[length(x)]] == "wales")| +# (x[[length(x)]] == "northern ireland")) { +# if (length(x) == 3) { +# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) +# } else { +# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) +# } +# } +# +# return(c(NA, NA)) # Not uk +# } +# +# # Apply the function across all addresses using `mapply` +# results <- t(mapply(process_uk_address, list_address)) +# colnames(results) <- c("uk_city", "uk_state") +# +# +# +# +# # Clean up results +# results <- as.data.frame(results, stringsAsFactors = FALSE) +# results$uk_city[results$uk_city == "not uk"] <- NA +# results$uk_state[results$uk_state == "not uk"] <- NA +# +# +# +# results$uk_pc<-NA +# +# # Define the function +# extract_uk_postcodes <- function(df, source_col, target_col,city_col) { +# # Regular expression pattern for UK postal codes +# # One or two initial letters. +# # One or two digits (and possibly a letter). +# # A mandatory space. +# # One digit followed by two letters. +# pattern <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" +# pattern2 <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [A-Za-z]{2}" +# pattern3 <- "[A-Za-z]{1,2}[0-9R]{3} [A-Za-z]{3}" +# +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# df[i, city_col] <- df[i, source_col] +# +# } +# } +# return(df) +# } +# +# # Example usage +# +# results <- extract_uk_postcodes(results, "uk_city","uk_pc", "uk_city") +# results <- extract_uk_postcodes(results, "uk_state","uk_pc", "uk_city") +# +# # any without numbers gets NA'd +# results$uk_pc[!grepl("\\d", results$uk_pc)] <- NA +# +# +# +# results$new_city<-NA +# +# +# # Define the function +# uk_city_id <- function(df, source_col, target_col) { +# # Regular expression pattern for UK postal codes +# # One or two initial letters. +# # One or two digits (and possibly a letter). +# # A mandatory space. +# # One digit followed by two letters. +# pattern <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- df[i, source_col] +# } +# } +# return(df) +# } +# +# # Example usage +# +# results <- uk_city_id(results, "uk_city","new_city") +# results <- uk_city_id(results, "uk_state","new_city") +# +# +# results$new_city<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$new_city) +# +# +# +# +# # Define the function +# uk_city_id <- function(df, source_col, target_col) { +# # Regular expression pattern for UK postal codes +# # One or two initial letters. +# # One or two digits (and possibly a letter). +# # A mandatory space. +# # One digit followed by two letters. +# pattern <- "\\s[A-Za-z0-9]{2,4}\\s[A-Za-z0-9]{2,4}" +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- df[i, source_col] +# } +# } +# return(df) +# } +# +# # Example usage +# results <- uk_city_id(results, "uk_state","new_city") +# +# +# +# results$uk_state<-ifelse(results$uk_state==results$uk_city, +# "",results$uk_state) +# +# results$new_city<-ifelse(is.na(results$new_city), +# results$uk_city, +# results$new_city) +# # remove zip codes from new city +# results$new_city<-gsub("\\s[A-Za-z0-9]{3}\\s[A-Za-z0-9]{3}","",results$new_city) +# results$new_city<-gsub("\\s[A-Za-z0-9]{4}\\s[A-Za-z0-9]{2,3}","",results$new_city) +# +# +# results$new_city<-ifelse(results$uk_state=="london", +# "london", +# results$new_city) +# +# +# results$uk_state<-ifelse(results$uk_state=="london", +# NA, +# results$uk_state) +# +# +# +# +# +# +# +# +# +# +# # keep portions with numbers / remove city names +# # results$uk_city<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$uk_city) +# # results$uk_state<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$uk_state) +# # results$uk_pc<-sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", results$uk_pc) +# +# # results$can_pc<-gsub("\\b[^\\d\\s]+\\b", "", results$can_pc) #+[a-z] removes 2nd ltr +# +# results$uk_pc[results$uk_pc == ""] <- NA +# # now remove any PC from city +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$new_city, city_list) +# +# # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$uk_state, state_list) +# +# # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$uk_pc, pc_list) +# +# rm(results) +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# # Extracts postal code when combined with city name ----------------------- +# +# city_list <- trimws(city_list, which = "both") +# +# +# +# city_clean<-data.frame( +# authorID=ID, +# addresses=addresses, +# original_city=city_list, +# city_list=city_list, +# state_list=state_list, +# country_list=country_list, +# extract_pc=pc_list) +# +# +# # # England, Scotland, Wales --------------------------------------------- +# # +# # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc)& (city_clean$country_list=="england"| +# # city_clean$country_list=="wales" | +# # city_clean$country_list=="scotland"), +# # gsub(".*\\s([a-z0-9]+\\s[a-z0-9]+)$", "\\1", city_clean$city_list), +# # city_clean$extract_pc) +# # +# # # This then deletes the postal code from the city name +# # city_clean$city_list<-ifelse((city_clean$country_list=="england"| +# # city_clean$country_list=="wales" | +# # city_clean$country_list=="scotland"), +# # (gsub("([A-Za-z0-9]+\\s[A-Za-z0-9]+)$", "", city_clean$city_list)), +# # city_clean$city_list) +# # +# city_clean[city_clean == ""] <- NA +# +# # Define the function +# extract_postcodes <- function(df, source_col, target_col) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- "\\b[A-Za-z]{1,2}[-][0-9]{4,8}\\b" +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# } +# } +# return(df) +# } +# +# # Example usage +# +# city_clean <- extract_postcodes(city_clean, "city_list","extract_pc") +# +# +# +# # Define the function +# extract_postcodes <- function(df, source_col, target_col) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- " [0-9]{3,9}" +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# } +# } +# return(df) +# } +# +# +# # Example usage +# +# city_clean <- extract_postcodes(city_clean, "city_list","extract_pc") +# +# +# +# # Define the function +# delete_matching_text <- function(df, col_a, col_b) { +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Check if the value in column A is not NA and is found within the text in column B +# if(!is.na(df[i, col_a]) && grepl(df[i, col_a], df[i, col_b])) { +# # Remove the matching text from column B by replacing it with an empty string +# df[i, col_b] <- gsub(df[i, col_a], "", df[i, col_b]) +# } +# } +# return(df) +# } +# +# city_clean <- delete_matching_text(city_clean, "extract_pc","city_list") +# +# city_clean <- delete_matching_text(city_clean, "extract_pc","state_list") +# +# +# # remove state if same as city +# +# city_clean$state_list<-ifelse(city_clean$state_list==city_clean$city_list,NA, city_clean$state_list) +# +# +# # there are some usa ones that had state zip buyt no country +# +# +# # Define the function +# extract_postcodes <- function(df, country_list, extract_pc, state_list) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- "\\b[A-Za-z]{2} [0-9]{5}\\b" +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, country_list]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, country_list], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, extract_pc])) { +# df[i, extract_pc] <- extracted_codes[[1]][1] +# df[i, state_list] <- extracted_codes[[1]][1] +# df[i, country_list] <- "usa" +# } +# } +# return(df) +# } +# +# +# # Example usage +# +# city_clean <- extract_postcodes(city_clean, +# "country_list", "extract_pc", "state_list") +# +# +# # remove zip codes from states +# city_clean$state_list<-ifelse(city_clean$country_list=="usa", +# gsub("[0-9]","",city_clean$state_list), +# city_clean$state_list) +# +# # remove state from zipcode +# city_clean$extract_pc<-ifelse(city_clean$country_list=="usa", +# gsub("[a-z]","",city_clean$extract_pc), +# city_clean$extract_pc) +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# city_clean$extract_pc <- trimws(gsub("&", "", city_clean$extract_pc)) +# city_clean[city_clean == ""] <- NA +# +# # +# # +# # +# # +# # +# head(city_clean) +# # +# # # Cleaning up city names with zip codes in them +# # # take the zip code and put it in a new column before deleting +# # +# # +# # +# # +# # # 2) Countries with postal code AFTER city -------------------------------- +# # +# # +# # city_zip<-c("ireland","qatar","kazakhstan","peru","turkey","south korea", +# # "japan","costa rica","mexico","new zealand","iran","thailand", +# # "russia","spain","india","singapore","indonesia","chile", +# # "finland","colombia","taiwan","saudi arabia","uruguay", +# # "slovenia","spain") +# # +# # +# # +# # city_clean$extract_pc<-ifelse((is.na(city_clean$extract_pc)& (city_clean$country_list %in% city_zip)), +# # (gsub(".*[A-Za-z]+", "", city_clean$city_list)), +# # city_clean$extract_pc) +# # # +# # city_clean$city_list<-ifelse((city_clean$country_list %in% city_zip), +# # gsub("\\s([0-9]+)", "", city_clean$city_list), +# # city_clean$city_list) +# # +# # city_clean[city_clean == ""] <- NA +# # +# # +# # # 3) Postal code and dash BEFORE city name -------------------------------- +# # +# # +# # zip_dash<-c("finland","slovakia","austria","portugal","belgium", +# # "spain","israel","czech republic","argentina","france", +# # "sweden","switzerland","turkey","germany","italy", +# # "lithuania","hungary","denmark","poland","norway", "iceland", +# # "greece", "ukraine","estonia","latvia","luxembourg","lativa", +# # "south africa","bulgaria","brazil") +# # +# # +# # +# # +# # city_clean$extract_pc <- ifelse((is.na(city_clean$extract_pc) & +# # (city_clean$country_list %in% zip_dash)), +# # # gsub("\\s([A-Za-z]+)", "", city_clean$city_list), +# # sub(" .*", "", city_clean$city_list), +# # city_clean$extract_pc) +# # +# # +# # city_clean$city_list<-ifelse((city_clean$country_list %in% zip_dash), +# # gsub("[a-zA-Z]+-[0-9]+", "", city_clean$city_list), +# # city_clean$city_list) +# # +# # +# # city_clean[city_clean == ""] <- NA +# # +# # # 4) Netherlands Postal Code ---------------------------------------------- +# # +# # # Netherlands has Postal Code before +# # # it is a combination of 2-3 blocks of letters and numbers +# # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc) & (city_clean$country_list=="netherlands"), +# # (gsub("^(([^ ]+ ){2}).*", "\\1", city_clean$city_list)), +# # city_clean$extract_pc) +# # city_clean$city_list<-ifelse((city_clean$country_list=="netherlands"), +# # (gsub(".*\\s", "", city_clean$city_list)), +# # city_clean$city_list) +# # city_clean[city_clean == ""] <- NA +# # +# # # 5) Venezuela ----------------------------------------------------------- +# # +# # # Venezuela has postal code after, it is combo of letters and numbers +# # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc) & (city_clean$country_list=="venezuela"), +# # gsub(".*\\s([a-z0-9]+)$", "\\1", city_clean$city_list), +# # city_clean$extract_pc) +# # +# # # This then deletes the postal code from the city name +# # city_clean$city_list<-ifelse(city_clean$country_list=="venezuela", +# # (gsub("(\\s[A-Za-z0-9]+)$", "", city_clean$city_list)), +# # city_clean$city_list) +# # city_clean[city_clean == ""] <- NA +# # +# # +# # +# # # trim ws +# # city_clean$extract_pc <- trimws(city_clean$extract_pc, which = "both") +# # city_clean$city_list <- trimws(city_clean$city_list, which = "both") +# # # This removes any that don't have numbers in them +# # city_clean$extract_pc[!grepl("[0-9]", city_clean$extract_pc)] <- NA +# # city_clean$city_list <- gsub("[0-9]+", "", city_clean$city_list) +# # +# # +# # Final Clean Up ---------------------------------------------------------- +# +# # Russia +# city_clean$city_list<-ifelse(city_clean$country_list=="russia", +# (gsub("st Petersburg p", "st petersburg", city_clean$city_list)), +# city_clean$city_list) +# +# +# +# # India +# India_delete<- c("dept") +# +# city_clean$city_list <- ifelse(city_clean$country_list=="india", +# gsub(paste(India_delete, collapse = "|"), NA, city_clean$city_list), +# city_clean$city_list) +# city_clean$city_list<-trimws(city_clean$city_list) # Remove leading/trailing whitespace +# +# # brazil +# city_clean$extract_pc<-ifelse((city_clean$country_list=="brazil" & nchar(city_clean$extract_pc) < 5), +# "",city_clean$extract_pc) +# city_clean[city_clean == ""] <- NA +# +# brazil_delete<- c("barretos canc hosp","univ fed sao paulo", +# "escola filosofia letras & ciencias humanas", +# "hosp sirio libanes","perola byington hosp", +# "sp","univ fed sao paulo","nacl","pesquisa", "museu","dept", +# "lab","zoologia", "inst", "programa","ppg", "ppg") +# +# +# city_clean$city_list <- ifelse(city_clean$country_list=="brazil", +# gsub(paste(brazil_delete, collapse = "|"), NA, city_clean$city_list), +# city_clean$city_list) +# +# city_clean$city_list<-trimws(city_clean$city_list, which="both") # Remove leading/trailing whitespace +# city_clean$state_list<-trimws(city_clean$state_list, which="both") # Remove leading/trailing whitespace +# city_clean$country_list<-trimws(city_clean$country_list, which="both") # Remove leading/trailing whitespace) # Remove leading/trailing whitespace +# # City Abbreviations +# +# city_clean$city_list<-ifelse(city_clean$city_list=="university pk", +# "university park", +# city_clean$city_list) +# +# +# +# city_clean$city_list<-ifelse(city_clean$city_list=="college stn", +# "college station", +# city_clean$city_list) +# +# city_clean$city_list<-ifelse(city_clean$city_list=="n chicago", +# "north chicago", +# city_clean$city_list) +# +# city_clean$city_list<-ifelse(city_clean$city_list=="college pk", +# "college park", +# city_clean$city_list) +# +# city_clean$city_list<-ifelse(city_clean$city_list=="research triangle pk", +# "research triangle park", +# city_clean$city_list) +# +# city_clean$city_list<-ifelse(city_clean$city_list=="state coll", +# "state college", +# city_clean$city_list) +# +# # city corrections +# city_clean$city_list<-ifelse((city_clean$city_list=="dehra dun" & city_clean$country_list == "india"), +# "dehradun", +# city_clean$city_list) +# +# city_clean$city_list<-ifelse((city_clean$city_list=="st john" & city_clean$country_list == "canada"), +# "st. john's", +# city_clean$city_list) +# +# city_clean$state_list<-ifelse(city_clean$state_list=="london ", +# "london", +# city_clean$state_list) +# +# city_clean$city_list<-ifelse((city_clean$state_list=="london" & city_clean$country_list == "england"), +# "london", +# city_clean$city_list) +# +# +# city_clean$state_list<-ifelse(city_clean$state_list=="london", +# NA, +# city_clean$state_list) +# +# +# +# city_list<-city_clean$city_list +# state_list<-city_clean$state_list +# pc_list<-city_clean$extract_pc +# country_list<-city_clean$country_list +# +# +# rm(brazil_delete,India_delete) +# +# +# # rm(city_clean) +# +# +# pc_list[pc_list == ""] <- NA +# city_list[city_list == ""] <- NA +# state_list[state_list == ""] <- NA +# dept_list[dept_list == ""] <- NA +# country_list[country_list == ""] <- NA +# # Create the df that will be returned +# cleaned_ad<-data.frame(ID, +# addresses, +# university_list, +# dept_list, +# city_list, +# country_list, +# state_list, +# pc_list) +# +# +# +# # names(cleaned_ad) +# +# +# list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))]) +# +# # Because formats of address printing is different across platforms +# # We are going to split using a tier system assuming first and last +# # info is somewhat reliable and guess the other info from the +# # remaining position of the info +# +# second_tier_list <- lapply(list_address1, function(x) x[length(x)]) +# second_tier_list <- trimws(second_tier_list, which = "both") +# second_tier_list[second_tier_list == "character(0)"] <- NA +# +# list_address2 <- lapply(list_address1, function(x) x[-c(length(x))]) +# +# third_tier_list <- lapply(list_address2, function(x) x[length(x)]) +# third_tier_list <- trimws(third_tier_list, which = "both") +# third_tier_list[third_tier_list == "character(0)"] <- NA +# +# # All remaining info is just shoved in this category +# remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1]) +# remain_list <- trimws(remain_list, which = "both") +# remain_list[remain_list == "character(0)"] <- NA +# +# +# # original +# # a_df <- data.frame( +# # adID = ID, university = university_list, +# # country = country_list, +# # state = state_list, postal_code = pc_list, city = NA, +# # department = NA, second_tier = second_tier_list, +# # third_tier = third_tier_list, +# # remain = remain_list, address = addresses, +# # stringsAsFactors = FALSE +# # ) +# +# # EB EDIT +# a_df_1 <- data.frame( +# adID = ID, +# university_1 = university_list, +# university = university_list, +# country_1 = country_list, +# country = country_list, +# state_1 = state_list, +# state = state_list, +# postal_code_1 = pc_list, +# postal_code = pc_list, +# city_1 = city_list, +# city = city_list, +# department_1 = dept_list, +# department = dept_list, +# second_tier = second_tier_list, +# third_tier = third_tier_list, +# remain = remain_list, +# address = addresses, +# stringsAsFactors = FALSE +# ) +# +# a_df_1$city_list<-ifelse(is.na(a_df_1$city_1), +# a_df_1$city, +# a_df_1$city_1) +# +# a_df_1$postal_code_1<-ifelse(is.na(a_df_1$postal_code_1), +# a_df_1$postal_code, +# a_df_1$postal_code_1) +# +# +# # Quebec Postal Code is PQ (Fr) or QC (En) but only QC is georeferenced +# a_df_1$state_1<-ifelse(a_df_1$state_1=="pq" & a_df_1$country_1 == "canada", +# "qc", +# a_df_1$state_1) +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# a_df<-a_df_1 +# +# rm(a_df_1) +# +# +# +# # try to fix the usa spots, which vary in format than other countries +# a_df$state<-ifelse(is.na(a_df$state),"",a_df$state) # added by eb to deal with index problem +# a_df$city[nchar(a_df$state) > 0] <- a_df$second_tier[nchar(a_df$state) > 0] +# a_df$state[nchar(a_df$state) == 0] <- NA +# a_df$postal_code[nchar(a_df$postal_code) == 0] <- NA +# a_df$department[!is.na(a_df$state) & !is.na(a_df$postal_code) & +# !is.na(a_df$state)] <- a_df$third_tier[!is.na(a_df$state) & +# !is.na(a_df$postal_code) & !is.na(a_df$state)] +# # fix a US problem when usa is not tacked onto the end +# +# us_reg <- "[[:alpha:]]{2}[[:space:]]{1}[[:digit:]]{5}" +# a_df$state[ grepl(us_reg, a_df$country) ] <- +# substr(a_df$country[ grepl(us_reg, a_df$country) ], 1, 2) +# +# a_df$postal_code[ grepl(us_reg, a_df$country) ] <- +# substr(a_df$country[grepl(us_reg, a_df$country)], 4, 8) +# +# a_df$country[grepl(us_reg, a_df$country)] <- "usa" +# +# +# a_df$state_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", +# a_df$state, +# a_df$state_1) +# +# a_df$postal_code_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", +# a_df$postal_code, +# a_df$postal_code_1) +# +# +# a_df$country_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", +# a_df$country, +# a_df$country_1) +# +# +# ########################## +# # We'll use regular expression to pull zipcodes +# # These formats differ by region +# int1 <- "[[:alpha:]]{2}[[:punct:]]{1}[[:digit:]]{1,8}" +# int2 <- paste("[[:space:]][[:upper:]][[:digit:]][[:upper:]]", +# "[[:space:]][[:digit:]][[:upper:]][[:digit:]]", sep="") +# int3 <- "[[:alpha:]][[:punct:]][[:digit:]]{4,7}" +# int4 <- "[:upper:]{1,2}[:alnum:]{1,3}[:space:][:digit:][:alnum:]{1,3}" +# int <- paste(int1, int2, int3, int4, sep = "|") +# +# uk <- paste("[[:upper:]]{1,2}[[:digit:]]{1,2}[[:space:]]", +# "{1}[[:digit:]]{1}[[:upper:]]{2}", sep="") +# +# mexico <- "[[:space:]]{1}[[:digit:]]{5}" # technically US as well +# +# panama <- "[[:digit:]]{4}-[[:digit:]]{5}" +# +# zip_search <- paste0(int, "|", uk, "|", mexico, "|", panama) +# +# +# +# +# +# # ADDRD EB INSTEAD OF +# a_df$city_1 <- ifelse(is.na(a_df$city_1),a_df$third_tier,a_df$city_1) +# a_df$state_1 <- ifelse(is.na(a_df$state_1),a_df$second_tier,a_df$state_1) +# a_df$postal_code_1 <- ifelse(is.na(a_df$postal_code_1),a_df$second_tier,a_df$postal_code_1) +# +# +# # fix country - usa +# # Function to remove everything before " usa" +# remove_before_usa <- function(x) { +# if (grepl(" usa", x)) { +# return(sub(".*(?= usa)", "", x, perl = TRUE)) +# } else { +# return(x) +# } +# } +# +# # Apply the function to each element in the vector +# a_df$country_1 <- sapply(a_df$country_1, remove_before_usa) +# a_df$country_1 <- trimws(a_df$country_1, which = "both") +# +# +# a_df$state_1 <- ifelse(a_df$country_1=="usa", +# (trimws(gsub("[0-9]", "", a_df$state_1), which="both")), +# a_df$state_1) +# +# a_df$postal_code_1 <- ifelse(a_df$country_1=="usa", +# (trimws(gsub("[a-z]", "", a_df$postal_code_1), which="both")), +# a_df$postal_code_1) +# +# +# +# ########################### +# id_run <- a_df$adID[is.na(a_df$state) & is.na(a_df$postal_code) & +# a_df$address != "Could not be extracted"] +# ########################### +# +# # We now iteratively run through the addresses using the concept that +# # certain information always exists next to each other. +# # Ex. city, state, country tend to exist next to each other. +# # We use the position of the zipcode also to help guide us +# # in where the information lies as well as how many fields were +# # given to us. +# for (i in id_run) { +# found <- FALSE +# row <- which(a_df$adID == i) +# university <- a_df$university[row] +# second_tier <- a_df$second_tier[row] +# third_tier <- a_df$third_tier[row] +# remain <- a_df$remain[row] +# city <- a_df$city[row] +# state <- a_df$state[row] +# postal_code <- a_df$postal_code[row] +# department <- a_df$department[row] +# grepl(zip_search, second_tier) +# grepl(zip_search, third_tier) +# # 2nd tier +# if (grepl(zip_search, second_tier)) { +# found <- TRUE +# postal_code <- regmatches(second_tier, regexpr(zip_search, second_tier)) +# city <- gsub(zip_search, "", second_tier) +# department <- ifelse(is.na(remain), third_tier, remain) +# } +# # 3RD tiers +# if (grepl(zip_search, third_tier) & !found) { +# found <- TRUE +# postal_code <- regmatches(third_tier, regexpr(zip_search, third_tier)) +# city <- gsub(zip_search, "", third_tier) +# state <- second_tier +# department <- remain +# } +# +# if (!found) { +# state <- second_tier +# city <- third_tier +# department <- remain +# } +# # To make university searching more efficient we'll override values +# # based on if it has university/college in the name, +# # where university overides college +# override_univ <- grepl("\\buniv\\b|\\buniversi", +# c(second_tier, third_tier, remain, city, university), +# ignore.case = TRUE) & +# !grepl("\\bdrv\\b|\\bdrive\\b", +# c(second_tier, third_tier, remain, city, university), +# ignore.case = TRUE) +# +# if (any(override_univ)) { +# university <- +# c(second_tier, third_tier, remain, city, university)[override_univ][1] +# assign( +# c("second_tier", "third_tier", "remain", "city", "university")[ +# override_univ][1], +# NA +# ) +# } +# # only if university doesnt already exist +# override_univ_col <- +# grepl("\\bcol\\b|college|\\bcoll\\b", +# c(second_tier, third_tier, remain, city, university), +# ignore.case = TRUE) & +# !grepl("\\bdrv\\b|\\bdrive\\b", +# c(second_tier, third_tier, remain, city, university), +# ignore.case = TRUE) +# +# if (!any(override_univ) & any(override_univ_col)) { +# university <- +# c(second_tier, third_tier, remain, city, university )[ +# override_univ_col][1] +# +# assign( +# c("second_tier", "third_tier", "remain", "city", "university")[ +# override_univ_col][1], +# NA +# ) +# } +# # more risky, but institutions as well, just incase its not a university +# override_univ_inst <- grepl("\\binst\\b|\\binstitut", +# c(second_tier, third_tier, remain, city, university), +# ignore.case = TRUE) +# if ( +# !any(override_univ) & !any(override_univ_col) & any(override_univ_inst) +# ) { +# department <- c(second_tier, third_tier, remain, city, university )[ +# override_univ_inst][1] +# +# assign( +# c("second_tier", "third_tier", "remain", "city", "university")[ +# override_univ_inst][1], +# NA +# ) +# } +# +# a_df$city[row] <- gsub("[[:digit:]]", "", city) +# a_df$state[row] <- gsub("[[:digit:]]", "", state) +# a_df$postal_code[row] <- postal_code +# a_df$department[row] <- department +# +# +# +# #########################Clock############################### +# total <- length(id_run) +# pb <- utils::txtProgressBar(min = 0, max = total, style = 3) +# utils::setTxtProgressBar(pb, which(id_run == i)) +# ############################################################# +# } +# +# +# city_fix <- is.na(a_df$city) & !is.na(a_df$state) +# a_df$city[city_fix] <- a_df$state[city_fix] +# a_df$state[city_fix] <- NA +# a_df$university[a_df$university == "Could not be extracted"] <- NA +# a_df$country[a_df$country == "Could not be extracted"] <- NA +# # a_df$country[a_df$country == "peoples r china"] <- "China" +# # a_df$country[a_df$country == "U Arab Emirates"] <- "United Arab Emirates" +# # a_df$country[a_df$country == "Mongol Peo Rep"] <- "Mongolia" +# +# a_df$postal_code[grepl("[[:alpha:]]{1,2}-", a_df$postal_code)] <- +# vapply(strsplit( +# a_df$postal_code[ grepl("[[:alpha:]]{1,2}-", a_df$postal_code)], +# "-"), +# function(x) x[2], character(1) +# ) +# #strip periods from the ends of city,state,country +# a_df$city <- gsub("\\.", "", a_df$city) +# a_df$state <- gsub("\\.", "", a_df$state) +# a_df$country <- gsub("\\.", "", a_df$country) +# a_df$country[a_df$country == ""] <- NA +# a_df$university[a_df$university == ""] <- NA +# a_df$postal_code[a_df$postal_code == ""] <- NA +# #convert to lower +# for (l in 2:ncol(a_df)){ +# a_df[, l] <- tolower(a_df[, l]) +# } +# +# # Select columns +# a_df <- a_df[, c("adID", +# "university_1", +# "country_1", +# "state_1", +# "postal_code_1", +# "city_1", +# "department_1", +# "second_tier", +# "third_tier", +# "remain", +# "address") +# ] +# +# # Rename columns +# colnames(a_df) <- c("adID", +# "university", +# "country", +# "state", +# "postal_code", +# "city", +# "department", +# "second_tier", +# "third_tier", +# "remain", +# "address") +# +# +# # sometimes the postal code fails to prse out of state. canm use this +# # when postal code is missing, but then need to remove +# # Function to extract numbers from one column and copy them to another column +# extract_numbers <- function(df, source_col, target_col) { +# if (is.na(target_col)) { +# +# +# df[[target_col]] <- gsub(".*?(\\d+).*", "\\1", df[[source_col]]) +# df[[target_col]][!grepl("\\d", df[[source_col]])] <- NA +# return(df) +# +# } else { +# return(df) +# } +# +# } +# +# # Apply the function to the dataframe +# a_df <- extract_numbers(a_df, "state", "postal_code") +# +# +# +# # ther postal code and city are sometimes in tier3 +# a_df$city <- ifelse(is.na(a_df$city), a_df$third_tier, a_df$city) +# a_df$postal_code <- ifelse(is.na(a_df$postal_code), a_df$third_tier, a_df$postal_code) +# +# +# +# +# +# # Function to remove matching characters from col1 based on col2 +# remove_matching <- function(col1, col2) { +# pattern <- paste0("\\b", gsub("([\\W])", "\\\\\\1", col2), "\\b") +# result <- sub(pattern, "", col1) +# trimws(result) +# } +# +# # Apply the function to each row +# a_df$city <- mapply(remove_matching, a_df$city, a_df$postal_code) +# a_df$state <- mapply(remove_matching, a_df$state, a_df$postal_code) +# +# +# +# library(tidyverse) +# +# +# country<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country) %>% +# mutate(summary=nchar(country)) %>% +# arrange(country) +# +# +# country_city<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country,city) %>% +# mutate(city_char=nchar(city)) %>% +# arrange(country,city) +# +# +# +# country_state<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country,state) %>% +# mutate(city_char=nchar(state)) %>% +# arrange(country,state) +# +# country_state_city<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country ,state,city) %>% +# mutate(city_char=nchar(city)) %>% +# arrange(country,state,city) +# +# +# country_state_city_pc<- a_df %>% +# # mutate(city_match=(city==second_tier)) %>% +# # filter(city_match==FALSE) %>% +# distinct(country ,state,postal_code,city) %>% +# mutate(city_char=nchar(city)) %>% +# arrange(country,state,postal_code,city) +# +# return(a_df) +# } +# +# +# +# +# +# +# +# +# +# +# #. old +# country_list <- vapply(list_address, function(x) { +# gsub("\\_", "", x[length(x)]) }, +# character(1)) +# country_list <- trimws(country_list, which = "both") +# pc_list <- rep(NA, length(list_address)) +# state_list <- rep(NA, length(list_address)) +# city_list<- rep(NA, length(list_address)) +# country_list <- ifelse(grepl("usa", country_list), "usa", country_list) +# +# list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))]) +# +# +# +# pc_list <- trimws(substr(country_list, 1, (vapply(regexpr("usa", +# country_list), function(x) x[1], numeric(1))) - 1), which = "right") +# state_list <- pc_list +# +# state_list[nchar(state_list) > 0] <- regmatches( +# state_list[nchar(state_list) > 0], +# regexpr("[[:lower:]]{2}", state_list[nchar(state_list) > 0]) +# ) +# state_list[state_list == ""] <- NA +# +# pc_list[nchar(pc_list) > 2] <- regmatches(pc_list[nchar(pc_list) > 2], +# regexpr("[[:digit:]]{5}", pc_list[nchar(pc_list) > 2])) +# pc_list[nchar(pc_list) < 3] <- "" +# pc_list[pc_list == ""] <- NA +# +# +# +# +# city_list <- lapply(list_address1, function(x) x[length(x)]) +# city_list <- trimws(city_list, which = "both") +# city_list[city_list == "character(0)"] <- NA +# +# list_address2 <- lapply(list_address1, function(x) x[-c(length(x))]) +# +# dept_list <- lapply(list_address2, function(x) x[length(x)]) +# dept_list <- trimws(dept_list, which = "both") +# dept_list[dept_list == "character(0)"] <- NA +# +# # All remaining info is just shoved in this category +# remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1]) +# remain_list <- trimws(remain_list, which = "both") +# remain_list[remain_list == "character(0)"] <- NA +# +# +# a_df <- data.frame( +# adID = ID, +# university = university_list, +# country = country_list, +# city = city_list, +# state = state_list, +# postal_code = pc_list, +# department = dept_list, +# remain = remain_list, +# address = addresses, +# stringsAsFactors = FALSE +# ) +# +# +# +# +# # # extracting postal codes - USA ------------------------------------------- +# +# # # USA -------------------------------------------------------------------- +# # +# # process_usa_address <- function(x) { +# # if (length(x) == 1) { +# # return(c(NA, NA)) # Not usa +# # } +# # if (grepl(" usa", x[[length(x)]])) { +# # if (length(x) == 4) { +# # return(c(trimws(x[[length(x) - 1]]), trimws(x[length(x)]))) +# # } +# # if (length(x) == 5) { +# # return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x)]]))) +# # } +# # if (length(x) == 3) { +# # return(c(trimws(x[[length(x) - 1]]), trimws(x[length(x)]))) +# # } else { +# # return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x)]]))) +# # } +# # } +# # +# # return(c(NA, NA)) # Not usa +# # } +# # +# # +# # # Apply the function across all addresses using `mapply` +# # results <- t(mapply(process_usa_address, list_address)) +# # colnames(results) <- c("usa_city", "usa_state") +# # +# # results<-as.data.frame(results) +# # results$pc<-NA +# # results$country<-NA +# # extract_usa_postcodes <- function(df, usa_state, pc,country) { +# # # 6 digits +# # pattern <- "[0-9]{5}" +# # +# # # Loop through each row of the dataframe +# # for(i in 1:nrow(df)) { +# # # Find all matches of the pattern in the source column +# # matches <- gregexpr(pattern, df[i, usa_state]) +# # # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # # Extract the matches +# # extracted_codes <- regmatches(df[i, usa_state], matches) +# # # If there's at least one match and the target column is NA, copy the first match to the target column +# # if(length(extracted_codes[[1]]) > 0 && is.na(df[i, pc])) { +# # df[i, pc] <- extracted_codes[[1]][1] +# # df[i, country] <- "usa" +# # # df[i, city_col] <- df[i, source_col] +# # +# # } +# # } +# # return(df) +# # } +# # +# # +# # results <- extract_usa_postcodes(results, "usa_state", "pc","country") +# # +# # +# # +# # # Update `city_list` if necessary +# # city_list <- ifelse(is.na(city_list), results$usa_city, city_list) +# # # +# # # # Update `state_list` if necessary +# # state_list<-ifelse(is.na(state_list),results$usa_state, state_list) +# # # +# # # # Update `pc_list` if necessary +# # pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) +# # # Update `country_list` if necessary +# # country_list<-ifelse(is.na(country_list),results$country, country_list) +# # +# # +# # +# # # Because formats of address printing is different across platforms +# # # We are going to split using a tier system assuming first and last +# # # info is somewhat reliable and guess the other info from the +# # # remaining position of the info +# # +# # +# # +# # +# # any without numbers gets NA'd +# # results$pc[!grepl("\\d", results$pc)] <- NA +# +# extract_usa_postcodes <- function(df, source, dest1, dest2,dest3) { +# # state and zip +# pattern <- "[a-z]{2} [0-9]{5}" +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# if (df[i, dest3]=="usa"){ +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# } +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, dest1])) { +# df[i, dest1] <- extracted_codes[[1]][1] +# } +# # if(length(extracted_codes[[1]]) > 0 && is.na(df[i, dest1])) { +# # df[i, dest2] <- df[i, source] +# # } +# # if(length(extracted_codes[[1]]) > 0 && is.na(df[i, dest1])) { +# # df[i, dest3] <- "usa" +# # } +# } +# return(df) +# } +# +# +# +# a_df <- extract_usa_postcodes(a_df, "city", "postal_code", "state","country") +# a_df <- extract_usa_postcodes(a_df, "country","postal_code","state","country") +# +# a_df$city<- ifelse(a_df$country=="usa" & ((a_df$city==a_df$state) & (a_df$state==a_df$postal_code)), +# a_df$department,a_df$city) +# +# # keep portions with numbers / remove city names +# a_df$state<-ifelse(a_df$country=="usa", sub(" .*", "",results$usa_state), a_df$state) +# a_df$state<-ifelse(a_df$country=="usa", sub(" usa","",results$usa_state), a_df$state) +# results$usa_state<-sub("[0-9]{5}","",results$usa_state) +# results$usa_state<-sub("usa","",results$usa_state) +# results$usa_state<-trimws(results$usa_state, which = "both") +# +# results$country<-ifelse(is.na(results$usa_city)==FALSE,"usa",results$country) +# +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$usa_city, city_list) +# # +# # # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$usa_state, state_list) +# # +# # # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) +# # Update `country_list` if necessary +# +# country_list<-ifelse((grepl("usa",country_list)),"usa", country_list) +# # remove any with "state_abbrev zip code" but no USA +# country_list <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", country_list, ignore.case = TRUE), "usa", country_list) +# +# +# +# us_state_abbreviations_lower <- c("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", +# "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", +# "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", +# "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", +# "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy") +# +# country_list<-ifelse(country_list%in%us_state_abbreviations_lower,"usa", country_list) +# +# +# # Update `city_list` if necessary +# city_list <- ifelse(is.na(city_list), results$usa_city, city_list) +# # +# # # Update `state_list` if necessary +# state_list<-ifelse(is.na(state_list),results$usa_state, state_list) +# # +# # # Update `pc_list` if necessary +# pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) +# # Update `country_list` if necessary +# +# rm(results) +# +# +# # a_df$postal_code <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$country, ignore.case = TRUE), a_df$country, a_df$postal_code) +# # a_df$state <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$country, ignore.case = TRUE), a_df$country, a_df$state) +# # a_df$state <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$state, ignore.case = TRUE), gsub("[0-9]{5}","",a_df$state), a_df$state) +# # a_df$postal_code <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$postal_code, ignore.case = TRUE), gsub("[a-z]{2}","",a_df$postal_code), a_df$postal_code) +# # a_df$country <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$country, ignore.case = TRUE), "usa", a_df$country) +# # +# # a_df$state <-trimws(a_df$state,which = "both") +# # a_df$postal_code<- trimws(a_df$postal_code,which = "both") +# # a_df$country <- trimws(a_df$country,which = "both") +# +# +# +# # Postal Codes letters-numbers -------------------------------------------- +# +# # Define the function +# extract_postcodes <- function(df, source_col, target_col,sequence) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- sequence +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# } +# } +# return(df) +# } +# +# # usage +# +# sequence<-"\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" +# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) +# sequence<-"\\bbr[-][0-9]{2} [0-9]{5}\\b" +# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) +# sequence<-"\\b[0-9]{5}-[0-9]{3}\\b" +# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) +# +# +# # postal codes - numbers after -------------------------------------------- +# sequence<-"\\b [0-9]{3,8}\\b" +# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) +# +# +# +# # postal codes - uk+ ------------------------------------------------------ +# +# # Define the function +# extract_uk_postcodes <- function(df, source_col, target_col,city_col,sequence) { +# # Regular expression pattern for UK postal codes +# # One or two initial letters. +# # One or two digits (and possibly a letter). +# # A mandatory space. +# # One digit followed by two letters. +# pattern <- sequence +# +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# df[i, city_col] <- df[i, source_col] +# +# } +# } +# return(df) +# } +# +# # Example usage +# sequence <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" +# # extract_uk_postcodes <- function(df, source_col, target_col,city_col,sequence) +# +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# sequence <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [A-Za-z]{2}" +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# sequence <- "[A-Za-z]{1,2}[0-9R]{3} [A-Za-z]{3}" +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# sequence <- "[A-Za-z]{2}[0-9R]{3}[A-Za-z]{2}" +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# sequence <- "[A-Za-z]{1}[0-9]{3} [0-9]{1}[A-Za-z]{2}" +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# sequence <- "[0-9]{4} [0-9]{2}[A-Za-z]{1}" +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# sequence <- "[A-Za-z]{2} [A-Za-z]{1} [0-9]{3}[A-Za-z]{2}" +# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) +# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) +# +# +# # postal codes - canada --------------------------------------------------- +# a_df$postal_code <- ifelse(a_df$country=="canada", NA,a_df$postal_code) +# +# a_df$state <- ifelse(a_df$country=="canada" & +# grepl("[A-Za-z]{2}", a_df$city, ignore.case = TRUE), +# a_df$city, a_df$state) +# +# a_df$city <- ifelse(a_df$country=="canada" & +# a_df$city==a_df$state, +# NA, a_df$city) +# +# +# +# +# # Define the function +# extract_postcodes <- function(df, source_col, target_col,sequence) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- sequence +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# } +# } +# return(df) +# } +# +# # usage +# +# sequence<-"\\b[A-Za-z]{1}[0-9]{1}[A-Za-z]{1} [0-9]{1}[A-Za-z]{1}[0-9]{1}\\b" +# a_df <- extract_postcodes(a_df, "state","postal_code",sequence) +# +# sequence<-"\\b[A-Za-z]{1}[0-9]{1}[A-Za-z]{1} [0-9]{1}[A-Za-z]{1}[0-9]{1}\\b" +# a_df <- extract_postcodes(a_df, "department","postal_code",sequence) +# +# a_df$state<-ifelse(a_df$country=="canada", sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", a_df$state),a_df$state) # all after first space +# +# +# a_df$city<-ifelse(a_df$country=="canada" & is.na(a_df$city),a_df$department,a_df$city) # all after first space +# +# a_df$city<-ifelse(a_df$country=="canada", sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", a_df$city),a_df$city) # all after first space +# +# a_df$state<-ifelse(a_df$country=="canada", gsub("[0-9]", "", a_df$state),a_df$state) +# +# +# +# # postal codes - india ---------------------------------------------------- +# +# a_df$state<-ifelse(a_df$country=="india", a_df$city,a_df$state) +# +# # Define the function +# extract_postcodes <- function(df, source_col, target_col,city_col,sequence) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- sequence +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# df[i, city_col] <- df[i, source_col] +# } +# } +# return(df) +# } +# +# # usage +# +# sequence<-"[0-9]{5,8}" +# a_df <- extract_postcodes(a_df, "department","postal_code","city",sequence) +# +# +# a_df$department<-ifelse(a_df$country=="india" & a_df$city==a_df$department,NA,a_df$department) +# a_df$city<-ifelse(a_df$country=="india" & a_df$city==a_df$state,NA,a_df$city) +# a_df$city<-ifelse(a_df$country=="india" & is.na(a_df$city),a_df$department,a_df$city) +# +# +# # Define the function +# extract_postcodes2 <- function(df, source_col, target_col,city_col,sequence) { +# # One or two initial letters. +# # mandatory dash +# # several numbers +# pattern <- sequence +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0) { +# df[i, city_col] <- df[i, source_col] +# } +# } +# return(df) +# } +# +# # usage +# +# sequence<-"[0-9]{5,8}" +# a_df <- extract_postcodes2(a_df, "postal_code","city","city",sequence) +# +# a_df$city <- ifelse(a_df$country=="india" & (grepl("delhi", a_df$city)|grepl("delhi", a_df$state)), +# "new delhi", a_df$city) +# +# +# +# +# +# # postal codes - australia ------------------------------------------------- +# +# a_df$state<-ifelse(a_df$country=="australia", a_df$city,a_df$state) +# a_df$city<-ifelse(a_df$country=="australia", a_df$department,a_df$city) +# +# a_df$state<-ifelse(a_df$country=="australia", gsub("\\s[0-9]{4,5}", "", a_df$state),a_df$state) +# a_df$city<-ifelse(a_df$country=="australia", gsub("\\s[0-9]{4,5}", "", a_df$city),a_df$city) +# +# +# # Brazil ------------------------------------------------------------------ +# +# +# +# +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# to_check <- c("dept","ctr","inst","ppg","andar","empresas", +# "programa", "ciencias", "unidade", "lab ") +# +# a_df$department <- ifelse(a_df$country=="brazil" & grepl(paste(to_check, collapse = "|"), a_df$department), +# NA, +# a_df$department) +# +# +# +# +# +# +# # Define the function +# extract_brl_postcodes <- function(df, source_col, target_col) { +# # 6 digits +# +# pattern <- "br-[0-9]{5,8}" +# +# # Loop through each row of the dataframe +# for(i in 1:nrow(df)) { +# # Find all matches of the pattern in the source column +# matches <- gregexpr(pattern, df[i, source_col]) +# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) +# # Extract the matches +# extracted_codes <- regmatches(df[i, source_col], matches) +# # If there's at least one match and the target column is NA, copy the first match to the target column +# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { +# df[i, target_col] <- extracted_codes[[1]][1] +# # df[i, brl_new_city] <- df[i, source_col] +# # df[i, city_col] <- df[i, source_col] +# +# } +# } +# return(df) +# } +# +# +# # df, source_col, target_col,city_col +# a_df <- extract_brl_postcodes(a_df, "department","postal_code") +# +# +# +# +# +# +# +# +# +# +# +# +# +# results <- extract_brl_postcodes(results, "brl_state","brl_pc") +# +# +# results$brl_new_city <- ifelse(is.na(results$brl_new_city), +# results$brl_state, +# results$brl_new_city) +# +# +# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, +# results$brl_city, +# results$brl_state) +# +# +# +# +# results$brl_new_city <- ifelse(is.na(results$brl_new_city), +# results$brl_city, +# results$brl_new_city) +# +# +# +# +# results$brl_city<-gsub("br-","",results$brl_city) +# results$brl_city<-sub("[0-9]{2,8}","",results$brl_city) +# results$brl_city<-sub("-","",results$brl_city) +# +# +# results$brl_new_city<-gsub("br-","",results$brl_new_city) +# results$brl_new_city<-sub("[0-9]{2,8}","",results$brl_new_city) +# results$brl_new_city<-sub("-","",results$brl_new_city) +# +# +# results$brl_pc<-gsub("br-","",results$brl_pc) +# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) +# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) +# +# results$brl_state<-gsub("br-","",results$brl_state) +# results$brl_state<-sub("[0-9]{2,8} ","",results$brl_state) +# results$brl_state<-sub(" [0-9]{2,8} ","",results$brl_state) +# +# # +# # +# # +# # +# # +# # results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) +# # results$brl_new_city<-gsub("gavea rio de janeiro","rio de janeiro",results$brl_new_city) +# # results$brl_new_city<-gsub("gavea","rio de janeiro",results$brl_new_city) +# # # +# # results$brl_city<-gsub("-","",results$brl_city) +# # +# # results$brl_state<-gsub("br-","",results$brl_state) +# # results$brl_state<-gsub("-","",results$brl_state) +# +# +# +# +# +# # any without numbers gets NA'd +# results$brl_pc[!grepl("\\d", results$brl_pc)] <- NA +# +# # keep portions with numbers / remove city names +# +# results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) +# results$brl_city<-sub(" [0-9]{2,8}","",results$brl_city) +# # +# # # Specific replacements +# # results$brl_city <- ifelse(grepl("rio de janeiro", results$brl_city), +# # "rio de janeiro", results$brl_city) +# +# results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), +# "rio de janeiro", results$brl_city) +# results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), +# "rj", results$brl_state) +# results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), +# "sp", results$brl_state) +# +# +# +# results$brl_city[results$brl_city==results$brl_state]<-NA +# +# +# # Clean up and adjust columns +# results[] <- lapply(results, trimws) +# +# +# # Define city-to-state mapping +# city_state_mapping <- data.frame( +# city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), +# state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), +# stringsAsFactors = FALSE +# ) +# +# # Match cities and states +# for (i in 1:nrow(city_state_mapping)) { +# results$brl_city <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), +# city_state_mapping$city[i], results$brl_city) +# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), +# city_state_mapping$state[i], results$brl_state) +# } +# +# +# # Match cities and states +# for (i in 1:nrow(city_state_mapping)) { +# +# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), +# city_state_mapping$state[i], results$brl_state) +# } +# +# +# +# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, +# results$brl_city, +# results$brl_state) +# +# +# results$brl_city <- trimws(results$brl_city, which = "both") +# results$brl_state <- trimws(results$brl_state, which = "both") +# # Define words indicating this is actually a dept not state or postal code +# # will use this list to delete the ones that don't apply +# to_check <- c("dept","ctr","inst","ppg", +# "programa", "ciencias", "unidade", "lab ") +# +# results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), +# results$brl_state, +# results$brl_city) +# +# +# # Final trimming +# results[] <- lapply(results, trimws) +# +# results$brl_city <- ifelse(results$brl_new_city==results$brl_city, +# NA, +# results$brl_city) +# From c52e58b5a1f9cb8a7d9acc9001409f1e3288bc20 Mon Sep 17 00:00:00 2001 From: embruna Date: Fri, 7 Mar 2025 16:56:27 -0500 Subject: [PATCH 14/34] ready to merge with authors_address --- R/authors_address_update.R | 2665 +----------------------------------- 1 file changed, 23 insertions(+), 2642 deletions(-) diff --git a/R/authors_address_update.R b/R/authors_address_update.R index 7888f02..bbc68e0 100644 --- a/R/authors_address_update.R +++ b/R/authors_address_update.R @@ -8,14 +8,8 @@ #' @param ID the authorID #' @noRd authors_address <- function(addresses, ID) { - # DELETE - # library(tidyverse) - # library(refsplitr) - final <- read.csv("./data/wos_txt/final.csv") - addresses <- final$address - ID <- final$authorID - addresses <- tolower(addresses) + addresses <- tolower(addresses) message("\nSplitting addresses\n") list_address <- strsplit(addresses, ",") @@ -275,18 +269,6 @@ authors_address <- function(addresses, ID) { ) - # TODO: correct this to catch any that didn;t get caught - - # - # us_state_abbreviations_lower <- c("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", - # "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", - # "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", - # "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", - # "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy") - # - # country_list<-ifelse(country_list%in%us_state_abbreviations_lower,"usa", country_list) - - # BRAZIL clean-up --------------------------------------------------------- @@ -383,53 +365,6 @@ authors_address <- function(addresses, ID) { ) } - - - # # Define words indicating this is actually a dept not state or postal code - # # will use this list to delete the ones that don't apply - # to_check <- c("dept","ctr","inst","ppg","andar","empresas", - # "programa", "ciencias", "unidade", "lab ") - # - # a_df$city <- ifelse(a_df$country=="brazil" & grepl(paste(to_check, collapse = "|"), a_df$city), - # a_df$state, - # results$brl_city) - # - # - # - # results$brl_state <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_state), - # results$brl_city, - # results$brl_state) - - # - # results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), - # "rio de janeiro", results$brl_city) - # results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), - # "rj", results$brl_state) - # results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), - # "sp", results$brl_state) - # - - - # results$brl_city[results$brl_city==results$brl_state]<-NA - - - # Clean up and adjust columns - # results[] <- lapply(results, trimws) - - - - - # brazil_delete<- c("barretos canc hosp","univ fed sao paulo", - # "escola filosofia letras & ciencias humanas", - # "hosp sirio libanes","perola byington hosp", - # "sp","univ fed sao paulo","nacl","pesquisa", "museu","dept", - # "lab","zoologia", "inst", "programa","ppg", "ppg") - - - # city_clean$city_list <- ifelse(city_clean$country_list=="brazil", - # gsub(paste(brazil_delete, collapse = "|"), NA, city_clean$city_list), - # city_clean$city_list) - # AUSTRALIA clean-up--------------------------------------------------------------- a_df$state <- ifelse(a_df$country == "australia", @@ -708,38 +643,34 @@ authors_address <- function(addresses, ID) { pattern <- paste(to_delete, collapse = "|") # Apply the ifelse function to update - # a_df$city <- ifelse(a_df$country == "china" & - # grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), - # NA, a_df$city) - # - # - # a_df$city<- ifelse(a_df$country=="china" & is.na(a_df$city), - # a_df$state, a_df$city) - # + a_df$city <- ifelse(a_df$country == "china" & + grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), + NA, a_df$city) + a_df$city<- ifelse(a_df$country=="china" & is.na(a_df$city), + a_df$state, a_df$city) + a_df[] <- lapply(a_df, trimws) - # Clean chn_pc1 and chn_pc2 - + - # TODO: check this, allows verifying iof what is in state is actually the city - - # chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", - # "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", - # "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", - # "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", - # "gansu", "inner mongolia", "jilin", "hainan", "ningxia", - # "qinghai", "tibet", "macao") - # pattern <- paste(to_delete, collapse = "|") - # a_df$state<- ifelse(a_df$country=="china" & - # grepl(pattern, a_df$state, ignore.case = TRUE, perl = TRUE), - # NA, a_df$state) - # TODO: fix. not necessary but useful. - # All the cities in the addresses, add as needed. - # chn_cities <- unique(c((a_df$country=="china" & a_df$city), "lhasa")) + # This verifies that what is in `city` is actually a city + # (or at least that what is in `city` is NOT a province) + chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", + "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", + "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", + "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", + "gansu", "inner mongolia", "jilin", "hainan", "ningxia", + "qinghai", "tibet", "macao") + pattern <- paste(to_delete, collapse = "|") + a_df$city<- ifelse(a_df$country=="china" & + grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), + NA, a_df$city) + + # pc is letters dash numbers ---------------------------------------------- @@ -754,10 +685,6 @@ authors_address <- function(addresses, ID) { a_df$state, a_df$postal_code ) - - # a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city==a_df$postal_code), - # a_df$postal_code,a_df$state) - a_df$state <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$postal_code), a_df$city, a_df$state ) @@ -771,15 +698,7 @@ authors_address <- function(addresses, ID) { NA, a_df$city2 ) - # # - # # a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$state), - # # NA,a_df$state) - # # a_df$state<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city==a_df$state), - # # NA,a_df$state) - # # - # a_df$city<- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2==a_df$city), - # a_df$city2,a_df$city) - + a_df$city <- ifelse(grepl(pattern, a_df$city), gsub("[0-9]", "", a_df$city), @@ -788,8 +707,6 @@ authors_address <- function(addresses, ID) { a_df$city <- gsub("[a-z]{1,2}- ", "", a_df$city) - # a_df$postal_code<- gsub("[a-z]","", a_df$postal_code) - a_df$city <- gsub("[-]", "", a_df$city) a_df[] <- lapply(a_df, trimws) @@ -805,9 +722,6 @@ authors_address <- function(addresses, ID) { a_df[] <- lapply(a_df, trimws) - - - # final check of postal codes (consecutive nos.) -------------------------- @@ -851,12 +765,7 @@ authors_address <- function(addresses, ID) { a_df$postal_code ) - - - # NETHERLANDS clean-up ---------------------------------------------------- - - # cities often have two characters at start (ascii version of ligature/dipthong) a_df[] <- lapply(a_df, trimws) @@ -864,15 +773,12 @@ authors_address <- function(addresses, ID) { (sub("^[a-zA-Z]{2} ", "", a_df$city)), a_df$city ) - - a_df[] <- lapply(a_df, trimws) # Final clean-up of some US cities and states ----------------------------- - # Remove panama canal zone from usa states (for stri) a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "cz"), NA, a_df$state @@ -1225,2528 +1131,3 @@ authors_address <- function(addresses, ID) { return(a_df) } - - - - - - - - - - -# country<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country) %>% -# mutate(summary=nchar(country)) %>% -# arrange(country) -# -# -# country_city<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country,city) %>% -# mutate(city_char=nchar(city)) %>% -# arrange(country,city) -# -# -# -# country_state<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country,state) %>% -# mutate(city_char=nchar(state)) %>% -# arrange(country,state) -# -# country_state_city<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country ,state,city) %>% -# mutate(city_char=nchar(city)) %>% -# arrange(country,state,city) -# -# -# country_state_city_pc<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country ,state,postal_code,city) %>% -# mutate(city_char=nchar(city)) %>% -# arrange(country,state,postal_code,city) - - - - - -# old code ---------------------------------------------------------------- - -# -# -# -# -# last_element %in% c("india", "china", -# -# -# a_df$state<- ifelse(a_df$state==a_df$city2,NA,a_df$state) -# -# -# -# ## Australia postal codes also separated -# # sometimes in the next-to-last, sometimes in the next-to-next-to-last (with state) -# -# # First need to "fix" the city -# # Function to check for three characters or numbers in the city-list and replace with NA -# process_aus_address <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) # Not Australia -# } -# -# if (x[[length(x)]] == "australia") { -# if (length(x) == 3) { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) -# } else { -# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) -# } -# } -# -# return(c(NA, NA)) # Not Australia -# } -# -# # Apply the function across all addresses using `mapply` -# results <- t(mapply(process_aus_address, list_address)) -# colnames(results) <- c("aus_city", "aus_state") -# -# # Clean up results -# results <- as.data.frame(results, stringsAsFactors = FALSE) -# results$aus_city[results$aus_city == "not australia"] <- NA -# results$aus_state[results$aus_state == "not australia"] <- NA -# -# # take the PC+state and assign to PC -# results$aus_pc<-results$aus_state -# # remove all digits from state -# results$aus_state <- trimws(gsub("\\d", "", results$aus_state)) -# # remove all letters from pc -# results$aus_pc <- trimws(gsub("[a-z]", "", results$aus_pc)) -# results$aus_pc[results$aus_pc == ""] <- NA -# -# # if na in PC, assign the city (some of which have PC) -# results$aus_pc <- ifelse(is.na(results$aus_pc), results$aus_city, results$aus_pc) -# # remove all metters from pc, leaving any new pc -# results$aus_pc <- trimws(gsub("[a-z]", "", results$aus_pc)) -# results$aus_pc[results$aus_pc == ""] <- NA -# # now remove any PC from city -# results$aus_city <- trimws(gsub("\\d", "", results$aus_city)) #alternative -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$aus_city, city_list) -# -# # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$aus_state, state_list) -# -# # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$aus_pc, pc_list) -# -# rm(results) -# -# -# -# # CANADA ------------------------------------------------------------------ -# -# ## Canada postal codes also separated -# # sometimes in the next-to-last, sometimes in the next-to-next-to-last (with state) -# -# process_can_address <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) # Not Canada -# } -# -# if (x[[length(x)]] == "canada") { -# if (length(x) == 3) { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) -# } else { -# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) -# } -# } -# -# return(c(NA, NA)) # Not canada -# } -# -# # Apply the function across all addresses using `mapply` -# results <- t(mapply(process_can_address, list_address)) -# colnames(results) <- c("can_city", "can_state") -# -# # Clean up results -# results <- as.data.frame(results, stringsAsFactors = FALSE) -# results$can_city[results$can_city == "not canada"] <- NA -# results$can_state[results$can_state == "not canada"] <- NA -# -# # take the PC+state and assign to PC -# results$can_pc<-results$can_state -# -# # any without numbers gets NA'd -# results$can_pc[!grepl("\\d", results$can_pc)] <- NA -# # removes state and removes ltr at start of PC -# results$can_pc<-sub("[a-z]\\s+[a-z]", "", results$can_pc) #+[a-z] -# -# -# # if na in PC, assign the city (some of which have PC) -# results$can_pc <- ifelse(is.na(results$can_pc), results$can_city, results$can_pc) -# results$can_pc[!grepl("\\d", results$can_pc)] <- NA -# -# # keep portions with numbers / remove city names -# # .*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$: This regex pattern matches any -# # characters (.*) followed by a word boundary (\\b) and exactly three word -# # characters (\\w{3}), capturing this as the first group ((\\w{3})). It -# # then matches any characters again (.*) followed by another word boundary -# # and exactly three word characters, capturing this as the second -# # group ((\\w{3})), and ensures this is at the end of the string ($). -# # sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", string): Replaces the -# # entire string with the two captured groups separated by a space. -# results$can_pc<-sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", results$can_pc) -# -# # results$can_pc<-gsub("\\b[^\\d\\s]+\\b", "", results$can_pc) #+[a-z] removes 2nd ltr -# -# results$can_pc[results$can_pc == ""] <- NA -# # now remove any PC from city -# results$can_city<-sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", results$can_city) # all after first space -# -# -# # fix state -# results$can_state <- trimws(gsub("\\d", "", results$can_state)) -# results$can_state <-trimws(sub(" .*", "", results$can_state)) # all after first space -# results$can_state <- gsub("british", "bc", results$can_state) -# results$can_state <- gsub("nova", "ns", results$can_state) -# # fix city -# results$can_city <- trimws(gsub("\\d", "", results$can_city)) -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$can_city, city_list) -# -# # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$can_state, state_list) -# -# # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$can_pc, pc_list) -# -# rm(results) -# -# # INDIA ------------------------------------------------------------------ -# -# # India states are almost always listed but New Delhi is complicated, -# # as are any with only three entries -# # Function to process addresses for both city and state -# process_india_address <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) # Not India -# } -# -# if (x[[length(x)]] == "india") { -# if (length(x) == 3) { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) -# } else { -# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) -# } -# } -# -# if (length(x) != 3 && grepl("delhi", x[[length(x) - 1]])) { -# return(c(trimws(x[[length(x) - 1]]), NA)) -# } -# -# return(c(NA, NA)) # Not India -# } -# -# # Apply the function across all addresses using `mapply` -# results <- t(mapply(process_india_address, list_address)) -# colnames(results) <- c("india_city", "india_state") -# -# # Clean up results -# results <- as.data.frame(results, stringsAsFactors = FALSE) -# results$india_city[results$india_city == "not india"] <- NA -# results$india_state[results$india_state == "not india"] <- NA -# -# # Remove numeric parts from state names and trim whitespace -# results$india_state <- trimws(gsub("\\s([0-9]+)", "", results$india_state)) -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$india_city, city_list) -# #### -# -# state_list<-ifelse(is.na(state_list),results$india_state, state_list) -# -# rm(results) -# -# -# # BRAZIL ------------------------------------------------------------------ -# -# -# # Function to process addresses for both city and state -# process_brazil_address <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) # Not brl -# } -# if (x[[length(x)]] == "brazil") { -# if (length(x) == 3) { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) -# } else { -# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) -# } -# } -# -# -# return(c(NA, NA)) # Not brl -# } -# -# -# -# # Apply the function across all addresses -# results <- as.data.frame(t(mapply(process_brazil_address, list_address)), -# stringsAsFactors = FALSE) -# colnames(results) <- c("brl_city", "brl_state") -# -# -# -# -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# to_check <- c("dept","ctr","inst","ppg","andar","empresas", -# "programa", "ciencias", "unidade", "lab ") -# -# results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), -# results$brl_state, -# results$brl_city) -# -# -# -# results$brl_state <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_state), -# results$brl_city, -# results$brl_state) -# -# -# # Define the function -# extract_brl_postcodes <- function(df, source_col, target_col,brl_new_city) { -# # 6 digits -# -# pattern <- "br-[0-9]{5,8}" -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# df[i, brl_new_city] <- df[i, source_col] -# # df[i, city_col] <- df[i, source_col] -# -# } -# } -# return(df) -# } -# -# results$brl_pc<-NA -# results$brl_new_city<-NA -# # df, source_col, target_col,city_col -# results <- extract_brl_postcodes(results, "brl_city","brl_pc", "brl_new_city") -# results <- extract_brl_postcodes(results, "brl_state","brl_pc", "brl_new_city") -# -# -# results$brl_new_city <- ifelse(is.na(results$brl_new_city), -# results$brl_state, -# results$brl_new_city) -# -# -# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, -# results$brl_city, -# results$brl_state) -# -# -# -# -# results$brl_new_city <- ifelse(is.na(results$brl_new_city), -# results$brl_city, -# results$brl_new_city) -# -# -# -# -# results$brl_city<-gsub("br-","",results$brl_city) -# results$brl_city<-sub("[0-9]{2,8}","",results$brl_city) -# results$brl_city<-sub("-","",results$brl_city) -# -# -# results$brl_new_city<-gsub("br-","",results$brl_new_city) -# results$brl_new_city<-sub("[0-9]{2,8}","",results$brl_new_city) -# results$brl_new_city<-sub("-","",results$brl_new_city) -# -# -# results$brl_pc<-gsub("br-","",results$brl_pc) -# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) -# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) -# -# results$brl_state<-gsub("br-","",results$brl_state) -# results$brl_state<-sub("[0-9]{2,8} ","",results$brl_state) -# results$brl_state<-sub(" [0-9]{2,8} ","",results$brl_state) -# -# # -# # -# # -# # -# # -# # results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) -# # results$brl_new_city<-gsub("gavea rio de janeiro","rio de janeiro",results$brl_new_city) -# # results$brl_new_city<-gsub("gavea","rio de janeiro",results$brl_new_city) -# # # -# # results$brl_city<-gsub("-","",results$brl_city) -# # -# # results$brl_state<-gsub("br-","",results$brl_state) -# # results$brl_state<-gsub("-","",results$brl_state) -# -# -# -# -# -# # any without numbers gets NA'd -# results$brl_pc[!grepl("\\d", results$brl_pc)] <- NA -# -# # keep portions with numbers / remove city names -# -# results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) -# results$brl_city<-sub(" [0-9]{2,8}","",results$brl_city) -# # -# # # Specific replacements -# # results$brl_city <- ifelse(grepl("rio de janeiro", results$brl_city), -# # "rio de janeiro", results$brl_city) -# -# results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), -# "rio de janeiro", results$brl_city) -# results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), -# "rj", results$brl_state) -# results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), -# "sp", results$brl_state) -# -# -# -# results$brl_city[results$brl_city==results$brl_state]<-NA -# -# -# # Clean up and adjust columns -# results[] <- lapply(results, trimws) -# -# -# # Define city-to-state mapping -# city_state_mapping <- data.frame( -# city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), -# state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), -# stringsAsFactors = FALSE -# ) -# -# # Match cities and states -# for (i in 1:nrow(city_state_mapping)) { -# results$brl_city <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), -# city_state_mapping$city[i], results$brl_city) -# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), -# city_state_mapping$state[i], results$brl_state) -# } -# -# -# # Match cities and states -# for (i in 1:nrow(city_state_mapping)) { -# -# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), -# city_state_mapping$state[i], results$brl_state) -# } -# -# -# -# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, -# results$brl_city, -# results$brl_state) -# -# -# results$brl_city <- trimws(results$brl_city, which = "both") -# results$brl_state <- trimws(results$brl_state, which = "both") -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# to_check <- c("dept","ctr","inst","ppg", -# "programa", "ciencias", "unidade", "lab ") -# -# results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), -# results$brl_state, -# results$brl_city) -# -# -# # Final trimming -# results[] <- lapply(results, trimws) -# -# results$brl_city <- ifelse(results$brl_new_city==results$brl_city, -# NA, -# results$brl_city) -# -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$brl_city, city_list) -# # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$brl_state, state_list) -# # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$brl_pc, pc_list) -# -# -# rm(results,city_state_mapping) -# -# -# -# # Handle postal codes (BR-[0-9]) -# results$brazil_pc <- ifelse(grepl("br-[0-9]", results$brazil_state), results$brazil_state, NA) -# results$brazil_pc <- ifelse(grepl("br-[0-9]", results$brazil_city) & is.na(results$brazil_pc), -# results$brazil_city, results$brazil_pc) -# -# # Remove BR codes from city and state -# results$brazil_city <- gsub("br-[0-9]+", "", results$brazil_city) -# results$brazil_state <- gsub("br-[0-9]+", "", results$brazil_state) -# -# # Define city-to-state mapping -# city_state_mapping <- data.frame( -# city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "sao paulo"), -# state = c("sp", "sp", "sp", "sp", "rj", "rj", "sp"), -# stringsAsFactors = FALSE -# ) -# -# # Match cities and states -# for (i in 1:nrow(city_state_mapping)) { -# results$brazil_city <- ifelse(grepl(city_state_mapping$city[i], results$brazil_state, ignore.case = TRUE), -# city_state_mapping$city[i], results$brazil_city) -# results$brazil_state <- ifelse(grepl(city_state_mapping$city[i], results$brazil_state, ignore.case = TRUE), -# city_state_mapping$state[i], results$brazil_state) -# } -# -# # Specific replacements -# results$brazil_city <- ifelse(grepl("museu nacl", results$brazil_city), -# "rio de janeiro", results$brazil_city) -# results$brazil_state <- ifelse(grepl("rio de janeiro", results$brazil_state, ignore.case = TRUE), -# "rj", results$brazil_state) -# results$brazil_state <- ifelse(grepl("sao paulo", results$brazil_state, ignore.case = TRUE), -# "sp", results$brazil_state) -# -# # cleanup -# results$brazil_city[results$brazil_city==results$brazil_state]<-NA -# results$brazil_city <- trimws(results$brazil_city, which = "both") -# results$brazil_state <- trimws(results$brazil_state, which = "both") -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# to_check <- c("dept","ctr","inst","ppg", -# "programa", "ciencias", "unidade", "lab ") -# -# results$brazil_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brazil_city), -# results$brazil_state, -# results$brazil_city) -# -# # Clean postal codes -# results$brazil_pc <- gsub("[A-Za-z-]", "", results$brazil_pc) -# -# # Final trimming -# results[] <- lapply(results, trimws) -# -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$brazil_city, city_list) -# # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$brazil_state, state_list) -# # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$brazil_pc, pc_list) -# -# -# rm(results,city_state_mapping) -# -# -# # CHINA ------------------------------------------------------------------- -# chn_extract <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) -# } else if (x[[length(x)]] == "china") { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 2]]))) -# } else { -# return(c(NA, NA)) -# } -# } -# -# chn_pc <- data.frame(do.call(rbind, lapply(list_address, chn_extract))) -# names(chn_pc) <- c("chn_city", "chn_state") -# -# # Define the function -# extract_china_postcodes <- function(df, source_col, target_col,chn_new_city) { -# # 6 digits -# pattern <- "[0-9]{6}" -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# df[i, chn_new_city] <- df[i, source_col] -# # df[i, city_col] <- df[i, source_col] -# -# } -# } -# return(df) -# } -# -# chn_pc$chn_pc<-NA -# chn_pc$chn_new_city<-NA -# # df, source_col, target_col,city_col -# chn_pc <- extract_china_postcodes(chn_pc, "chn_city","chn_pc", "chn_new_city") -# chn_pc <- extract_china_postcodes(chn_pc, "chn_state","chn_pc", "chn_new_city") -# -# -# -# # any without numbers gets NA'd -# chn_pc$chn_pc[!grepl("\\d", chn_pc$chn_pc)] <- NA -# -# # keep portions with numbers / remove city names -# chn_pc$chn_new_city<-sub("[0-9]{6}","",chn_pc$chn_new_city) -# chn_pc$chn_city<-sub("[0-9]{6}","",chn_pc$chn_city) -# chn_pc$chn_state<-sub("[0-9]{6}","",chn_pc$chn_state) -# -# -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# -# to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", -# "dept", "div", "univ", "hosp", "coll", "sci", "rd", -# "program","minist", "educ", "sch ", "grad ", "fac ", -# "assoc","forest") -# -# # Define the function -# -# -# # Print the resulting dataframe -# print(a_df) -# -# -# chn_pc[, c("chn_city","chn_state", "chn_pc", "chn_new_city")]<-lapply(chn_pc[, c("chn_city","chn_state", "chn_pc", "chn_new_city")], trimws) -# -# -# clean_column <- function(column, delete_terms) { -# column <- gsub(paste(delete_terms, collapse = "|"), NA, column) -# column <- gsub("[0-9]", "", column) # Remove digits -# trimws(column) # Remove leading/trailing whitespace -# } -# -# -# # Clean chn_pc1 and chn_pc2 -# chn_pc$chn_city <- clean_column(chn_pc$chn_city, to_delete) -# chn_pc$chn_new_city <- clean_column(chn_pc$chn_new_city, to_delete) -# chn_pc$chn_state <- clean_column(chn_pc$chn_state, to_delete) -# -# -# chn_pc$chn_new_city <- ifelse(is.na(chn_pc$chn_new_city), -# chn_pc$chn_state, -# chn_pc$chn_new_city) -# -# -# chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_state)==TRUE), -# NA,chn_pc$chn_state) -# -# -# -# chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_city)==FALSE), -# chn_pc$chn_city,chn_pc$chn_state) -# -# -# -# chn_pc$chn_state <- ifelse(((chn_pc$chn_new_city==chn_pc$chn_state)==TRUE), -# NA,chn_pc$chn_state) -# -# chn_pc$chn_state <- gsub(" province", "",chn_pc$chn_state) -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), chn_pc$chn_new_city, city_list) -# -# # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),chn_pc$chn_state, state_list) -# -# # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),chn_pc$chn_pc, pc_list) -# -# rm(chn_pc) -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # ### China has some where the postal code is with the city, so fix those here -# # # Extract postal code information from the list -# # chn_extract <- function(x) { -# # if (length(x) == 1) { -# # return(c(NA, NA)) -# # } else if (x[[length(x)]] == "china") { -# # return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 2]]))) -# # } else { -# # return(c(NA, NA)) -# # } -# # } -# -# # Apply extraction to list_address -# # chn_missing_pc <- data.frame(do.call(rbind, lapply(list_address, chn_extract))) -# # names(chn_missing_pc) <- c("chn_pc1", "chn_pc2") -# # -# # # Define words indicating this is actually a dept not state or postal code -# # # will use this list to delete the ones that don't apply -# # to_delete <- c("&", "inst", "ctr", "med", "chem", "lab", "biol", -# # "dept", "div", "univ", "hosp", "coll", "sci", "rd","program" -# # "minist", "educ", "sch ", "grad ", "fac ","assoc") -# # -# # -# # -# # # Extract numeric postal codes -# # chn_missing_pc$chn_pc_from1 <- as.numeric(gsub("[A-Za-z]", "", chn_missing_pc$chn_pc1)) -# # chn_missing_pc$chn_pc_from2 <- as.numeric(gsub("[A-Za-z]", "", chn_missing_pc$chn_pc2)) -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # -# # clean_column <- function(column, delete_terms) { -# # column <- gsub(paste(delete_terms, collapse = "|"), NA, column) -# # column <- gsub("[0-9]", "", column) # Remove digits -# # trimws(column) # Remove leading/trailing whitespace -# # } -# # -# # # Clean chn_pc1 and chn_pc2 -# # chn_missing_pc$chn_pc1 <- clean_column(chn_missing_pc$chn_pc1, to_delete) -# # chn_missing_pc$chn_pc2 <- clean_column(chn_missing_pc$chn_pc2, to_delete) -# # -# # # Initialize empty columns for final outputs -# # chn_missing_pc$chn_pc <- NA -# # chn_missing_pc$chn_city <- NA -# # chn_missing_pc$chn_state <- NA -# # -# # # Assign postal codes, cities, and states based on conditions -# # assign_chn_data <- function(from1, from2, pc1, pc2) { -# # list( -# # chn_pc = ifelse(is.na(from1) & !is.na(from2), from2, from1), -# # chn_city = ifelse(is.na(from1) & !is.na(from2), pc2, pc1), -# # chn_state = ifelse(is.na(from1) & !is.na(from2), pc1, pc2) -# # ) -# # } -# # -# # chn_result <- assign_chn_data(chn_missing_pc$chn_pc_from1, -# # chn_missing_pc$chn_pc_from2, -# # chn_missing_pc$chn_pc1, -# # chn_missing_pc$chn_pc2) -# # -# # chn_missing_pc$chn_pc <- chn_result$chn_pc -# # chn_missing_pc$chn_city <- gsub("[0-9]", "", chn_result$chn_city) -# # chn_missing_pc$chn_state <- gsub("[0-9]", "", chn_result$chn_state) -# -# # # Define Chinese states and cities -# # chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", -# # "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", -# # "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", -# # "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", -# # "gansu", "inner mongolia", "jilin", "hainan", "ningxia", -# # "qinghai", "tibet", "macao") -# # -# # # All the cities in the addresses, add as needed. -# # chn_cities <- unique(c(chn_missing_pc$chn_city, "lhasa")) -# -# # Update states and cities based on matching conditions -# # chn_missing_pc$chn_state <- ifelse(is.na(chn_missing_pc$chn_state) & chn_missing_pc$chn_pc1 %in% chn_states, -# # chn_missing_pc$chn_pc1, chn_missing_pc$chn_state) -# # chn_missing_pc$chn_state <- ifelse(is.na(chn_missing_pc$chn_state) & chn_missing_pc$chn_pc2 %in% chn_states, -# # chn_missing_pc$chn_pc2, chn_missing_pc$chn_state) -# # -# # chn_missing_pc$chn_city <- ifelse(is.na(chn_missing_pc$chn_city) & !(chn_missing_pc$chn_pc1 %in% chn_states), -# # chn_missing_pc$chn_pc1, chn_missing_pc$chn_city) -# # chn_missing_pc$chn_city <- ifelse(is.na(chn_missing_pc$chn_city) & !(chn_missing_pc$chn_pc2 %in% chn_states), -# # chn_missing_pc$chn_pc2, chn_missing_pc$chn_city) -# # -# # # put the postal codes and cities in the pc_list, state_list -# # pc_list<-ifelse(is.na(pc_list),chn_missing_pc$chn_pc, pc_list) -# # city_list<-ifelse(is.na(city_list),chn_missing_pc$chn_city, city_list) -# # state_list<-ifelse(is.na(state_list),chn_missing_pc$chn_state, state_list) -# # -# # -# # rm(chn_cities,chn_states,to_delete,chn_result,chn_missing_pc) -# -# -# -# -# -# -# -# -# -# # UK ------------------------------------------------------------------ -# -# process_uk_address <- function(x) { -# if (length(x) == 1) { -# return(c(NA, NA)) # Not uk -# } -# -# if ((x[[length(x)]] == "england")| -# (x[[length(x)]] == "scotland")| -# (x[[length(x)]] == "wales")| -# (x[[length(x)]] == "northern ireland")) { -# if (length(x) == 3) { -# return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x) - 1]]))) -# } else { -# return(c(trimws(x[[length(x) - 2]]), trimws(x[[length(x) - 1]]))) -# } -# } -# -# return(c(NA, NA)) # Not uk -# } -# -# # Apply the function across all addresses using `mapply` -# results <- t(mapply(process_uk_address, list_address)) -# colnames(results) <- c("uk_city", "uk_state") -# -# -# -# -# # Clean up results -# results <- as.data.frame(results, stringsAsFactors = FALSE) -# results$uk_city[results$uk_city == "not uk"] <- NA -# results$uk_state[results$uk_state == "not uk"] <- NA -# -# -# -# results$uk_pc<-NA -# -# # Define the function -# extract_uk_postcodes <- function(df, source_col, target_col,city_col) { -# # Regular expression pattern for UK postal codes -# # One or two initial letters. -# # One or two digits (and possibly a letter). -# # A mandatory space. -# # One digit followed by two letters. -# pattern <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" -# pattern2 <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [A-Za-z]{2}" -# pattern3 <- "[A-Za-z]{1,2}[0-9R]{3} [A-Za-z]{3}" -# -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# df[i, city_col] <- df[i, source_col] -# -# } -# } -# return(df) -# } -# -# # Example usage -# -# results <- extract_uk_postcodes(results, "uk_city","uk_pc", "uk_city") -# results <- extract_uk_postcodes(results, "uk_state","uk_pc", "uk_city") -# -# # any without numbers gets NA'd -# results$uk_pc[!grepl("\\d", results$uk_pc)] <- NA -# -# -# -# results$new_city<-NA -# -# -# # Define the function -# uk_city_id <- function(df, source_col, target_col) { -# # Regular expression pattern for UK postal codes -# # One or two initial letters. -# # One or two digits (and possibly a letter). -# # A mandatory space. -# # One digit followed by two letters. -# pattern <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- df[i, source_col] -# } -# } -# return(df) -# } -# -# # Example usage -# -# results <- uk_city_id(results, "uk_city","new_city") -# results <- uk_city_id(results, "uk_state","new_city") -# -# -# results$new_city<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$new_city) -# -# -# -# -# # Define the function -# uk_city_id <- function(df, source_col, target_col) { -# # Regular expression pattern for UK postal codes -# # One or two initial letters. -# # One or two digits (and possibly a letter). -# # A mandatory space. -# # One digit followed by two letters. -# pattern <- "\\s[A-Za-z0-9]{2,4}\\s[A-Za-z0-9]{2,4}" -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- df[i, source_col] -# } -# } -# return(df) -# } -# -# # Example usage -# results <- uk_city_id(results, "uk_state","new_city") -# -# -# -# results$uk_state<-ifelse(results$uk_state==results$uk_city, -# "",results$uk_state) -# -# results$new_city<-ifelse(is.na(results$new_city), -# results$uk_city, -# results$new_city) -# # remove zip codes from new city -# results$new_city<-gsub("\\s[A-Za-z0-9]{3}\\s[A-Za-z0-9]{3}","",results$new_city) -# results$new_city<-gsub("\\s[A-Za-z0-9]{4}\\s[A-Za-z0-9]{2,3}","",results$new_city) -# -# -# results$new_city<-ifelse(results$uk_state=="london", -# "london", -# results$new_city) -# -# -# results$uk_state<-ifelse(results$uk_state=="london", -# NA, -# results$uk_state) -# -# -# -# -# -# -# -# -# -# -# # keep portions with numbers / remove city names -# # results$uk_city<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$uk_city) -# # results$uk_state<-sub("[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}","",results$uk_state) -# # results$uk_pc<-sub(".*\\b(\\w{3})\\b.*\\b(\\w{3})\\b$", "\\1 \\2", results$uk_pc) -# -# # results$can_pc<-gsub("\\b[^\\d\\s]+\\b", "", results$can_pc) #+[a-z] removes 2nd ltr -# -# results$uk_pc[results$uk_pc == ""] <- NA -# # now remove any PC from city -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$new_city, city_list) -# -# # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$uk_state, state_list) -# -# # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$uk_pc, pc_list) -# -# rm(results) -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# # Extracts postal code when combined with city name ----------------------- -# -# city_list <- trimws(city_list, which = "both") -# -# -# -# city_clean<-data.frame( -# authorID=ID, -# addresses=addresses, -# original_city=city_list, -# city_list=city_list, -# state_list=state_list, -# country_list=country_list, -# extract_pc=pc_list) -# -# -# # # England, Scotland, Wales --------------------------------------------- -# # -# # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc)& (city_clean$country_list=="england"| -# # city_clean$country_list=="wales" | -# # city_clean$country_list=="scotland"), -# # gsub(".*\\s([a-z0-9]+\\s[a-z0-9]+)$", "\\1", city_clean$city_list), -# # city_clean$extract_pc) -# # -# # # This then deletes the postal code from the city name -# # city_clean$city_list<-ifelse((city_clean$country_list=="england"| -# # city_clean$country_list=="wales" | -# # city_clean$country_list=="scotland"), -# # (gsub("([A-Za-z0-9]+\\s[A-Za-z0-9]+)$", "", city_clean$city_list)), -# # city_clean$city_list) -# # -# city_clean[city_clean == ""] <- NA -# -# # Define the function -# extract_postcodes <- function(df, source_col, target_col) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- "\\b[A-Za-z]{1,2}[-][0-9]{4,8}\\b" -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# } -# } -# return(df) -# } -# -# # Example usage -# -# city_clean <- extract_postcodes(city_clean, "city_list","extract_pc") -# -# -# -# # Define the function -# extract_postcodes <- function(df, source_col, target_col) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- " [0-9]{3,9}" -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# } -# } -# return(df) -# } -# -# -# # Example usage -# -# city_clean <- extract_postcodes(city_clean, "city_list","extract_pc") -# -# -# -# # Define the function -# delete_matching_text <- function(df, col_a, col_b) { -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Check if the value in column A is not NA and is found within the text in column B -# if(!is.na(df[i, col_a]) && grepl(df[i, col_a], df[i, col_b])) { -# # Remove the matching text from column B by replacing it with an empty string -# df[i, col_b] <- gsub(df[i, col_a], "", df[i, col_b]) -# } -# } -# return(df) -# } -# -# city_clean <- delete_matching_text(city_clean, "extract_pc","city_list") -# -# city_clean <- delete_matching_text(city_clean, "extract_pc","state_list") -# -# -# # remove state if same as city -# -# city_clean$state_list<-ifelse(city_clean$state_list==city_clean$city_list,NA, city_clean$state_list) -# -# -# # there are some usa ones that had state zip buyt no country -# -# -# # Define the function -# extract_postcodes <- function(df, country_list, extract_pc, state_list) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- "\\b[A-Za-z]{2} [0-9]{5}\\b" -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, country_list]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, country_list], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, extract_pc])) { -# df[i, extract_pc] <- extracted_codes[[1]][1] -# df[i, state_list] <- extracted_codes[[1]][1] -# df[i, country_list] <- "usa" -# } -# } -# return(df) -# } -# -# -# # Example usage -# -# city_clean <- extract_postcodes(city_clean, -# "country_list", "extract_pc", "state_list") -# -# -# # remove zip codes from states -# city_clean$state_list<-ifelse(city_clean$country_list=="usa", -# gsub("[0-9]","",city_clean$state_list), -# city_clean$state_list) -# -# # remove state from zipcode -# city_clean$extract_pc<-ifelse(city_clean$country_list=="usa", -# gsub("[a-z]","",city_clean$extract_pc), -# city_clean$extract_pc) -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# city_clean$extract_pc <- trimws(gsub("&", "", city_clean$extract_pc)) -# city_clean[city_clean == ""] <- NA -# -# # -# # -# # -# # -# # -# head(city_clean) -# # -# # # Cleaning up city names with zip codes in them -# # # take the zip code and put it in a new column before deleting -# # -# # -# # -# # -# # # 2) Countries with postal code AFTER city -------------------------------- -# # -# # -# # city_zip<-c("ireland","qatar","kazakhstan","peru","turkey","south korea", -# # "japan","costa rica","mexico","new zealand","iran","thailand", -# # "russia","spain","india","singapore","indonesia","chile", -# # "finland","colombia","taiwan","saudi arabia","uruguay", -# # "slovenia","spain") -# # -# # -# # -# # city_clean$extract_pc<-ifelse((is.na(city_clean$extract_pc)& (city_clean$country_list %in% city_zip)), -# # (gsub(".*[A-Za-z]+", "", city_clean$city_list)), -# # city_clean$extract_pc) -# # # -# # city_clean$city_list<-ifelse((city_clean$country_list %in% city_zip), -# # gsub("\\s([0-9]+)", "", city_clean$city_list), -# # city_clean$city_list) -# # -# # city_clean[city_clean == ""] <- NA -# # -# # -# # # 3) Postal code and dash BEFORE city name -------------------------------- -# # -# # -# # zip_dash<-c("finland","slovakia","austria","portugal","belgium", -# # "spain","israel","czech republic","argentina","france", -# # "sweden","switzerland","turkey","germany","italy", -# # "lithuania","hungary","denmark","poland","norway", "iceland", -# # "greece", "ukraine","estonia","latvia","luxembourg","lativa", -# # "south africa","bulgaria","brazil") -# # -# # -# # -# # -# # city_clean$extract_pc <- ifelse((is.na(city_clean$extract_pc) & -# # (city_clean$country_list %in% zip_dash)), -# # # gsub("\\s([A-Za-z]+)", "", city_clean$city_list), -# # sub(" .*", "", city_clean$city_list), -# # city_clean$extract_pc) -# # -# # -# # city_clean$city_list<-ifelse((city_clean$country_list %in% zip_dash), -# # gsub("[a-zA-Z]+-[0-9]+", "", city_clean$city_list), -# # city_clean$city_list) -# # -# # -# # city_clean[city_clean == ""] <- NA -# # -# # # 4) Netherlands Postal Code ---------------------------------------------- -# # -# # # Netherlands has Postal Code before -# # # it is a combination of 2-3 blocks of letters and numbers -# # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc) & (city_clean$country_list=="netherlands"), -# # (gsub("^(([^ ]+ ){2}).*", "\\1", city_clean$city_list)), -# # city_clean$extract_pc) -# # city_clean$city_list<-ifelse((city_clean$country_list=="netherlands"), -# # (gsub(".*\\s", "", city_clean$city_list)), -# # city_clean$city_list) -# # city_clean[city_clean == ""] <- NA -# # -# # # 5) Venezuela ----------------------------------------------------------- -# # -# # # Venezuela has postal code after, it is combo of letters and numbers -# # city_clean$extract_pc<-ifelse(is.na(city_clean$extract_pc) & (city_clean$country_list=="venezuela"), -# # gsub(".*\\s([a-z0-9]+)$", "\\1", city_clean$city_list), -# # city_clean$extract_pc) -# # -# # # This then deletes the postal code from the city name -# # city_clean$city_list<-ifelse(city_clean$country_list=="venezuela", -# # (gsub("(\\s[A-Za-z0-9]+)$", "", city_clean$city_list)), -# # city_clean$city_list) -# # city_clean[city_clean == ""] <- NA -# # -# # -# # -# # # trim ws -# # city_clean$extract_pc <- trimws(city_clean$extract_pc, which = "both") -# # city_clean$city_list <- trimws(city_clean$city_list, which = "both") -# # # This removes any that don't have numbers in them -# # city_clean$extract_pc[!grepl("[0-9]", city_clean$extract_pc)] <- NA -# # city_clean$city_list <- gsub("[0-9]+", "", city_clean$city_list) -# # -# # -# # Final Clean Up ---------------------------------------------------------- -# -# # Russia -# city_clean$city_list<-ifelse(city_clean$country_list=="russia", -# (gsub("st Petersburg p", "st petersburg", city_clean$city_list)), -# city_clean$city_list) -# -# -# -# # India -# India_delete<- c("dept") -# -# city_clean$city_list <- ifelse(city_clean$country_list=="india", -# gsub(paste(India_delete, collapse = "|"), NA, city_clean$city_list), -# city_clean$city_list) -# city_clean$city_list<-trimws(city_clean$city_list) # Remove leading/trailing whitespace -# -# # brazil -# city_clean$extract_pc<-ifelse((city_clean$country_list=="brazil" & nchar(city_clean$extract_pc) < 5), -# "",city_clean$extract_pc) -# city_clean[city_clean == ""] <- NA -# -# brazil_delete<- c("barretos canc hosp","univ fed sao paulo", -# "escola filosofia letras & ciencias humanas", -# "hosp sirio libanes","perola byington hosp", -# "sp","univ fed sao paulo","nacl","pesquisa", "museu","dept", -# "lab","zoologia", "inst", "programa","ppg", "ppg") -# -# -# city_clean$city_list <- ifelse(city_clean$country_list=="brazil", -# gsub(paste(brazil_delete, collapse = "|"), NA, city_clean$city_list), -# city_clean$city_list) -# -# city_clean$city_list<-trimws(city_clean$city_list, which="both") # Remove leading/trailing whitespace -# city_clean$state_list<-trimws(city_clean$state_list, which="both") # Remove leading/trailing whitespace -# city_clean$country_list<-trimws(city_clean$country_list, which="both") # Remove leading/trailing whitespace) # Remove leading/trailing whitespace -# # City Abbreviations -# -# city_clean$city_list<-ifelse(city_clean$city_list=="university pk", -# "university park", -# city_clean$city_list) -# -# -# -# city_clean$city_list<-ifelse(city_clean$city_list=="college stn", -# "college station", -# city_clean$city_list) -# -# city_clean$city_list<-ifelse(city_clean$city_list=="n chicago", -# "north chicago", -# city_clean$city_list) -# -# city_clean$city_list<-ifelse(city_clean$city_list=="college pk", -# "college park", -# city_clean$city_list) -# -# city_clean$city_list<-ifelse(city_clean$city_list=="research triangle pk", -# "research triangle park", -# city_clean$city_list) -# -# city_clean$city_list<-ifelse(city_clean$city_list=="state coll", -# "state college", -# city_clean$city_list) -# -# # city corrections -# city_clean$city_list<-ifelse((city_clean$city_list=="dehra dun" & city_clean$country_list == "india"), -# "dehradun", -# city_clean$city_list) -# -# city_clean$city_list<-ifelse((city_clean$city_list=="st john" & city_clean$country_list == "canada"), -# "st. john's", -# city_clean$city_list) -# -# city_clean$state_list<-ifelse(city_clean$state_list=="london ", -# "london", -# city_clean$state_list) -# -# city_clean$city_list<-ifelse((city_clean$state_list=="london" & city_clean$country_list == "england"), -# "london", -# city_clean$city_list) -# -# -# city_clean$state_list<-ifelse(city_clean$state_list=="london", -# NA, -# city_clean$state_list) -# -# -# -# city_list<-city_clean$city_list -# state_list<-city_clean$state_list -# pc_list<-city_clean$extract_pc -# country_list<-city_clean$country_list -# -# -# rm(brazil_delete,India_delete) -# -# -# # rm(city_clean) -# -# -# pc_list[pc_list == ""] <- NA -# city_list[city_list == ""] <- NA -# state_list[state_list == ""] <- NA -# dept_list[dept_list == ""] <- NA -# country_list[country_list == ""] <- NA -# # Create the df that will be returned -# cleaned_ad<-data.frame(ID, -# addresses, -# university_list, -# dept_list, -# city_list, -# country_list, -# state_list, -# pc_list) -# -# -# -# # names(cleaned_ad) -# -# -# list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))]) -# -# # Because formats of address printing is different across platforms -# # We are going to split using a tier system assuming first and last -# # info is somewhat reliable and guess the other info from the -# # remaining position of the info -# -# second_tier_list <- lapply(list_address1, function(x) x[length(x)]) -# second_tier_list <- trimws(second_tier_list, which = "both") -# second_tier_list[second_tier_list == "character(0)"] <- NA -# -# list_address2 <- lapply(list_address1, function(x) x[-c(length(x))]) -# -# third_tier_list <- lapply(list_address2, function(x) x[length(x)]) -# third_tier_list <- trimws(third_tier_list, which = "both") -# third_tier_list[third_tier_list == "character(0)"] <- NA -# -# # All remaining info is just shoved in this category -# remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1]) -# remain_list <- trimws(remain_list, which = "both") -# remain_list[remain_list == "character(0)"] <- NA -# -# -# # original -# # a_df <- data.frame( -# # adID = ID, university = university_list, -# # country = country_list, -# # state = state_list, postal_code = pc_list, city = NA, -# # department = NA, second_tier = second_tier_list, -# # third_tier = third_tier_list, -# # remain = remain_list, address = addresses, -# # stringsAsFactors = FALSE -# # ) -# -# # EB EDIT -# a_df_1 <- data.frame( -# adID = ID, -# university_1 = university_list, -# university = university_list, -# country_1 = country_list, -# country = country_list, -# state_1 = state_list, -# state = state_list, -# postal_code_1 = pc_list, -# postal_code = pc_list, -# city_1 = city_list, -# city = city_list, -# department_1 = dept_list, -# department = dept_list, -# second_tier = second_tier_list, -# third_tier = third_tier_list, -# remain = remain_list, -# address = addresses, -# stringsAsFactors = FALSE -# ) -# -# a_df_1$city_list<-ifelse(is.na(a_df_1$city_1), -# a_df_1$city, -# a_df_1$city_1) -# -# a_df_1$postal_code_1<-ifelse(is.na(a_df_1$postal_code_1), -# a_df_1$postal_code, -# a_df_1$postal_code_1) -# -# -# # Quebec Postal Code is PQ (Fr) or QC (En) but only QC is georeferenced -# a_df_1$state_1<-ifelse(a_df_1$state_1=="pq" & a_df_1$country_1 == "canada", -# "qc", -# a_df_1$state_1) -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# -# a_df<-a_df_1 -# -# rm(a_df_1) -# -# -# -# # try to fix the usa spots, which vary in format than other countries -# a_df$state<-ifelse(is.na(a_df$state),"",a_df$state) # added by eb to deal with index problem -# a_df$city[nchar(a_df$state) > 0] <- a_df$second_tier[nchar(a_df$state) > 0] -# a_df$state[nchar(a_df$state) == 0] <- NA -# a_df$postal_code[nchar(a_df$postal_code) == 0] <- NA -# a_df$department[!is.na(a_df$state) & !is.na(a_df$postal_code) & -# !is.na(a_df$state)] <- a_df$third_tier[!is.na(a_df$state) & -# !is.na(a_df$postal_code) & !is.na(a_df$state)] -# # fix a US problem when usa is not tacked onto the end -# -# us_reg <- "[[:alpha:]]{2}[[:space:]]{1}[[:digit:]]{5}" -# a_df$state[ grepl(us_reg, a_df$country) ] <- -# substr(a_df$country[ grepl(us_reg, a_df$country) ], 1, 2) -# -# a_df$postal_code[ grepl(us_reg, a_df$country) ] <- -# substr(a_df$country[grepl(us_reg, a_df$country)], 4, 8) -# -# a_df$country[grepl(us_reg, a_df$country)] <- "usa" -# -# -# a_df$state_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", -# a_df$state, -# a_df$state_1) -# -# a_df$postal_code_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", -# a_df$postal_code, -# a_df$postal_code_1) -# -# -# a_df$country_1 <- ifelse(a_df$country=="usa" & a_df$country_1!="usa", -# a_df$country, -# a_df$country_1) -# -# -# ########################## -# # We'll use regular expression to pull zipcodes -# # These formats differ by region -# int1 <- "[[:alpha:]]{2}[[:punct:]]{1}[[:digit:]]{1,8}" -# int2 <- paste("[[:space:]][[:upper:]][[:digit:]][[:upper:]]", -# "[[:space:]][[:digit:]][[:upper:]][[:digit:]]", sep="") -# int3 <- "[[:alpha:]][[:punct:]][[:digit:]]{4,7}" -# int4 <- "[:upper:]{1,2}[:alnum:]{1,3}[:space:][:digit:][:alnum:]{1,3}" -# int <- paste(int1, int2, int3, int4, sep = "|") -# -# uk <- paste("[[:upper:]]{1,2}[[:digit:]]{1,2}[[:space:]]", -# "{1}[[:digit:]]{1}[[:upper:]]{2}", sep="") -# -# mexico <- "[[:space:]]{1}[[:digit:]]{5}" # technically US as well -# -# panama <- "[[:digit:]]{4}-[[:digit:]]{5}" -# -# zip_search <- paste0(int, "|", uk, "|", mexico, "|", panama) -# -# -# -# -# -# # ADDRD EB INSTEAD OF -# a_df$city_1 <- ifelse(is.na(a_df$city_1),a_df$third_tier,a_df$city_1) -# a_df$state_1 <- ifelse(is.na(a_df$state_1),a_df$second_tier,a_df$state_1) -# a_df$postal_code_1 <- ifelse(is.na(a_df$postal_code_1),a_df$second_tier,a_df$postal_code_1) -# -# -# # fix country - usa -# # Function to remove everything before " usa" -# remove_before_usa <- function(x) { -# if (grepl(" usa", x)) { -# return(sub(".*(?= usa)", "", x, perl = TRUE)) -# } else { -# return(x) -# } -# } -# -# # Apply the function to each element in the vector -# a_df$country_1 <- sapply(a_df$country_1, remove_before_usa) -# a_df$country_1 <- trimws(a_df$country_1, which = "both") -# -# -# a_df$state_1 <- ifelse(a_df$country_1=="usa", -# (trimws(gsub("[0-9]", "", a_df$state_1), which="both")), -# a_df$state_1) -# -# a_df$postal_code_1 <- ifelse(a_df$country_1=="usa", -# (trimws(gsub("[a-z]", "", a_df$postal_code_1), which="both")), -# a_df$postal_code_1) -# -# -# -# ########################### -# id_run <- a_df$adID[is.na(a_df$state) & is.na(a_df$postal_code) & -# a_df$address != "Could not be extracted"] -# ########################### -# -# # We now iteratively run through the addresses using the concept that -# # certain information always exists next to each other. -# # Ex. city, state, country tend to exist next to each other. -# # We use the position of the zipcode also to help guide us -# # in where the information lies as well as how many fields were -# # given to us. -# for (i in id_run) { -# found <- FALSE -# row <- which(a_df$adID == i) -# university <- a_df$university[row] -# second_tier <- a_df$second_tier[row] -# third_tier <- a_df$third_tier[row] -# remain <- a_df$remain[row] -# city <- a_df$city[row] -# state <- a_df$state[row] -# postal_code <- a_df$postal_code[row] -# department <- a_df$department[row] -# grepl(zip_search, second_tier) -# grepl(zip_search, third_tier) -# # 2nd tier -# if (grepl(zip_search, second_tier)) { -# found <- TRUE -# postal_code <- regmatches(second_tier, regexpr(zip_search, second_tier)) -# city <- gsub(zip_search, "", second_tier) -# department <- ifelse(is.na(remain), third_tier, remain) -# } -# # 3RD tiers -# if (grepl(zip_search, third_tier) & !found) { -# found <- TRUE -# postal_code <- regmatches(third_tier, regexpr(zip_search, third_tier)) -# city <- gsub(zip_search, "", third_tier) -# state <- second_tier -# department <- remain -# } -# -# if (!found) { -# state <- second_tier -# city <- third_tier -# department <- remain -# } -# # To make university searching more efficient we'll override values -# # based on if it has university/college in the name, -# # where university overides college -# override_univ <- grepl("\\buniv\\b|\\buniversi", -# c(second_tier, third_tier, remain, city, university), -# ignore.case = TRUE) & -# !grepl("\\bdrv\\b|\\bdrive\\b", -# c(second_tier, third_tier, remain, city, university), -# ignore.case = TRUE) -# -# if (any(override_univ)) { -# university <- -# c(second_tier, third_tier, remain, city, university)[override_univ][1] -# assign( -# c("second_tier", "third_tier", "remain", "city", "university")[ -# override_univ][1], -# NA -# ) -# } -# # only if university doesnt already exist -# override_univ_col <- -# grepl("\\bcol\\b|college|\\bcoll\\b", -# c(second_tier, third_tier, remain, city, university), -# ignore.case = TRUE) & -# !grepl("\\bdrv\\b|\\bdrive\\b", -# c(second_tier, third_tier, remain, city, university), -# ignore.case = TRUE) -# -# if (!any(override_univ) & any(override_univ_col)) { -# university <- -# c(second_tier, third_tier, remain, city, university )[ -# override_univ_col][1] -# -# assign( -# c("second_tier", "third_tier", "remain", "city", "university")[ -# override_univ_col][1], -# NA -# ) -# } -# # more risky, but institutions as well, just incase its not a university -# override_univ_inst <- grepl("\\binst\\b|\\binstitut", -# c(second_tier, third_tier, remain, city, university), -# ignore.case = TRUE) -# if ( -# !any(override_univ) & !any(override_univ_col) & any(override_univ_inst) -# ) { -# department <- c(second_tier, third_tier, remain, city, university )[ -# override_univ_inst][1] -# -# assign( -# c("second_tier", "third_tier", "remain", "city", "university")[ -# override_univ_inst][1], -# NA -# ) -# } -# -# a_df$city[row] <- gsub("[[:digit:]]", "", city) -# a_df$state[row] <- gsub("[[:digit:]]", "", state) -# a_df$postal_code[row] <- postal_code -# a_df$department[row] <- department -# -# -# -# #########################Clock############################### -# total <- length(id_run) -# pb <- utils::txtProgressBar(min = 0, max = total, style = 3) -# utils::setTxtProgressBar(pb, which(id_run == i)) -# ############################################################# -# } -# -# -# city_fix <- is.na(a_df$city) & !is.na(a_df$state) -# a_df$city[city_fix] <- a_df$state[city_fix] -# a_df$state[city_fix] <- NA -# a_df$university[a_df$university == "Could not be extracted"] <- NA -# a_df$country[a_df$country == "Could not be extracted"] <- NA -# # a_df$country[a_df$country == "peoples r china"] <- "China" -# # a_df$country[a_df$country == "U Arab Emirates"] <- "United Arab Emirates" -# # a_df$country[a_df$country == "Mongol Peo Rep"] <- "Mongolia" -# -# a_df$postal_code[grepl("[[:alpha:]]{1,2}-", a_df$postal_code)] <- -# vapply(strsplit( -# a_df$postal_code[ grepl("[[:alpha:]]{1,2}-", a_df$postal_code)], -# "-"), -# function(x) x[2], character(1) -# ) -# #strip periods from the ends of city,state,country -# a_df$city <- gsub("\\.", "", a_df$city) -# a_df$state <- gsub("\\.", "", a_df$state) -# a_df$country <- gsub("\\.", "", a_df$country) -# a_df$country[a_df$country == ""] <- NA -# a_df$university[a_df$university == ""] <- NA -# a_df$postal_code[a_df$postal_code == ""] <- NA -# #convert to lower -# for (l in 2:ncol(a_df)){ -# a_df[, l] <- tolower(a_df[, l]) -# } -# -# # Select columns -# a_df <- a_df[, c("adID", -# "university_1", -# "country_1", -# "state_1", -# "postal_code_1", -# "city_1", -# "department_1", -# "second_tier", -# "third_tier", -# "remain", -# "address") -# ] -# -# # Rename columns -# colnames(a_df) <- c("adID", -# "university", -# "country", -# "state", -# "postal_code", -# "city", -# "department", -# "second_tier", -# "third_tier", -# "remain", -# "address") -# -# -# # sometimes the postal code fails to prse out of state. canm use this -# # when postal code is missing, but then need to remove -# # Function to extract numbers from one column and copy them to another column -# extract_numbers <- function(df, source_col, target_col) { -# if (is.na(target_col)) { -# -# -# df[[target_col]] <- gsub(".*?(\\d+).*", "\\1", df[[source_col]]) -# df[[target_col]][!grepl("\\d", df[[source_col]])] <- NA -# return(df) -# -# } else { -# return(df) -# } -# -# } -# -# # Apply the function to the dataframe -# a_df <- extract_numbers(a_df, "state", "postal_code") -# -# -# -# # ther postal code and city are sometimes in tier3 -# a_df$city <- ifelse(is.na(a_df$city), a_df$third_tier, a_df$city) -# a_df$postal_code <- ifelse(is.na(a_df$postal_code), a_df$third_tier, a_df$postal_code) -# -# -# -# -# -# # Function to remove matching characters from col1 based on col2 -# remove_matching <- function(col1, col2) { -# pattern <- paste0("\\b", gsub("([\\W])", "\\\\\\1", col2), "\\b") -# result <- sub(pattern, "", col1) -# trimws(result) -# } -# -# # Apply the function to each row -# a_df$city <- mapply(remove_matching, a_df$city, a_df$postal_code) -# a_df$state <- mapply(remove_matching, a_df$state, a_df$postal_code) -# -# -# -# library(tidyverse) -# -# -# country<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country) %>% -# mutate(summary=nchar(country)) %>% -# arrange(country) -# -# -# country_city<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country,city) %>% -# mutate(city_char=nchar(city)) %>% -# arrange(country,city) -# -# -# -# country_state<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country,state) %>% -# mutate(city_char=nchar(state)) %>% -# arrange(country,state) -# -# country_state_city<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country ,state,city) %>% -# mutate(city_char=nchar(city)) %>% -# arrange(country,state,city) -# -# -# country_state_city_pc<- a_df %>% -# # mutate(city_match=(city==second_tier)) %>% -# # filter(city_match==FALSE) %>% -# distinct(country ,state,postal_code,city) %>% -# mutate(city_char=nchar(city)) %>% -# arrange(country,state,postal_code,city) -# -# return(a_df) -# } -# -# -# -# -# -# -# -# -# -# -# #. old -# country_list <- vapply(list_address, function(x) { -# gsub("\\_", "", x[length(x)]) }, -# character(1)) -# country_list <- trimws(country_list, which = "both") -# pc_list <- rep(NA, length(list_address)) -# state_list <- rep(NA, length(list_address)) -# city_list<- rep(NA, length(list_address)) -# country_list <- ifelse(grepl("usa", country_list), "usa", country_list) -# -# list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))]) -# -# -# -# pc_list <- trimws(substr(country_list, 1, (vapply(regexpr("usa", -# country_list), function(x) x[1], numeric(1))) - 1), which = "right") -# state_list <- pc_list -# -# state_list[nchar(state_list) > 0] <- regmatches( -# state_list[nchar(state_list) > 0], -# regexpr("[[:lower:]]{2}", state_list[nchar(state_list) > 0]) -# ) -# state_list[state_list == ""] <- NA -# -# pc_list[nchar(pc_list) > 2] <- regmatches(pc_list[nchar(pc_list) > 2], -# regexpr("[[:digit:]]{5}", pc_list[nchar(pc_list) > 2])) -# pc_list[nchar(pc_list) < 3] <- "" -# pc_list[pc_list == ""] <- NA -# -# -# -# -# city_list <- lapply(list_address1, function(x) x[length(x)]) -# city_list <- trimws(city_list, which = "both") -# city_list[city_list == "character(0)"] <- NA -# -# list_address2 <- lapply(list_address1, function(x) x[-c(length(x))]) -# -# dept_list <- lapply(list_address2, function(x) x[length(x)]) -# dept_list <- trimws(dept_list, which = "both") -# dept_list[dept_list == "character(0)"] <- NA -# -# # All remaining info is just shoved in this category -# remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1]) -# remain_list <- trimws(remain_list, which = "both") -# remain_list[remain_list == "character(0)"] <- NA -# -# -# a_df <- data.frame( -# adID = ID, -# university = university_list, -# country = country_list, -# city = city_list, -# state = state_list, -# postal_code = pc_list, -# department = dept_list, -# remain = remain_list, -# address = addresses, -# stringsAsFactors = FALSE -# ) -# -# -# -# -# # # extracting postal codes - USA ------------------------------------------- -# -# # # USA -------------------------------------------------------------------- -# # -# # process_usa_address <- function(x) { -# # if (length(x) == 1) { -# # return(c(NA, NA)) # Not usa -# # } -# # if (grepl(" usa", x[[length(x)]])) { -# # if (length(x) == 4) { -# # return(c(trimws(x[[length(x) - 1]]), trimws(x[length(x)]))) -# # } -# # if (length(x) == 5) { -# # return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x)]]))) -# # } -# # if (length(x) == 3) { -# # return(c(trimws(x[[length(x) - 1]]), trimws(x[length(x)]))) -# # } else { -# # return(c(trimws(x[[length(x) - 1]]), trimws(x[[length(x)]]))) -# # } -# # } -# # -# # return(c(NA, NA)) # Not usa -# # } -# # -# # -# # # Apply the function across all addresses using `mapply` -# # results <- t(mapply(process_usa_address, list_address)) -# # colnames(results) <- c("usa_city", "usa_state") -# # -# # results<-as.data.frame(results) -# # results$pc<-NA -# # results$country<-NA -# # extract_usa_postcodes <- function(df, usa_state, pc,country) { -# # # 6 digits -# # pattern <- "[0-9]{5}" -# # -# # # Loop through each row of the dataframe -# # for(i in 1:nrow(df)) { -# # # Find all matches of the pattern in the source column -# # matches <- gregexpr(pattern, df[i, usa_state]) -# # # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # # Extract the matches -# # extracted_codes <- regmatches(df[i, usa_state], matches) -# # # If there's at least one match and the target column is NA, copy the first match to the target column -# # if(length(extracted_codes[[1]]) > 0 && is.na(df[i, pc])) { -# # df[i, pc] <- extracted_codes[[1]][1] -# # df[i, country] <- "usa" -# # # df[i, city_col] <- df[i, source_col] -# # -# # } -# # } -# # return(df) -# # } -# # -# # -# # results <- extract_usa_postcodes(results, "usa_state", "pc","country") -# # -# # -# # -# # # Update `city_list` if necessary -# # city_list <- ifelse(is.na(city_list), results$usa_city, city_list) -# # # -# # # # Update `state_list` if necessary -# # state_list<-ifelse(is.na(state_list),results$usa_state, state_list) -# # # -# # # # Update `pc_list` if necessary -# # pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) -# # # Update `country_list` if necessary -# # country_list<-ifelse(is.na(country_list),results$country, country_list) -# # -# # -# # -# # # Because formats of address printing is different across platforms -# # # We are going to split using a tier system assuming first and last -# # # info is somewhat reliable and guess the other info from the -# # # remaining position of the info -# # -# # -# # -# # -# # any without numbers gets NA'd -# # results$pc[!grepl("\\d", results$pc)] <- NA -# -# extract_usa_postcodes <- function(df, source, dest1, dest2,dest3) { -# # state and zip -# pattern <- "[a-z]{2} [0-9]{5}" -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# if (df[i, dest3]=="usa"){ -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# } -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, dest1])) { -# df[i, dest1] <- extracted_codes[[1]][1] -# } -# # if(length(extracted_codes[[1]]) > 0 && is.na(df[i, dest1])) { -# # df[i, dest2] <- df[i, source] -# # } -# # if(length(extracted_codes[[1]]) > 0 && is.na(df[i, dest1])) { -# # df[i, dest3] <- "usa" -# # } -# } -# return(df) -# } -# -# -# -# a_df <- extract_usa_postcodes(a_df, "city", "postal_code", "state","country") -# a_df <- extract_usa_postcodes(a_df, "country","postal_code","state","country") -# -# a_df$city<- ifelse(a_df$country=="usa" & ((a_df$city==a_df$state) & (a_df$state==a_df$postal_code)), -# a_df$department,a_df$city) -# -# # keep portions with numbers / remove city names -# a_df$state<-ifelse(a_df$country=="usa", sub(" .*", "",results$usa_state), a_df$state) -# a_df$state<-ifelse(a_df$country=="usa", sub(" usa","",results$usa_state), a_df$state) -# results$usa_state<-sub("[0-9]{5}","",results$usa_state) -# results$usa_state<-sub("usa","",results$usa_state) -# results$usa_state<-trimws(results$usa_state, which = "both") -# -# results$country<-ifelse(is.na(results$usa_city)==FALSE,"usa",results$country) -# -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$usa_city, city_list) -# # -# # # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$usa_state, state_list) -# # -# # # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) -# # Update `country_list` if necessary -# -# country_list<-ifelse((grepl("usa",country_list)),"usa", country_list) -# # remove any with "state_abbrev zip code" but no USA -# country_list <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", country_list, ignore.case = TRUE), "usa", country_list) -# -# -# -# us_state_abbreviations_lower <- c("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", -# "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", -# "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", -# "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", -# "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy") -# -# country_list<-ifelse(country_list%in%us_state_abbreviations_lower,"usa", country_list) -# -# -# # Update `city_list` if necessary -# city_list <- ifelse(is.na(city_list), results$usa_city, city_list) -# # -# # # Update `state_list` if necessary -# state_list<-ifelse(is.na(state_list),results$usa_state, state_list) -# # -# # # Update `pc_list` if necessary -# pc_list<-ifelse(is.na(pc_list),results$pc, pc_list) -# # Update `country_list` if necessary -# -# rm(results) -# -# -# # a_df$postal_code <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$country, ignore.case = TRUE), a_df$country, a_df$postal_code) -# # a_df$state <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$country, ignore.case = TRUE), a_df$country, a_df$state) -# # a_df$state <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$state, ignore.case = TRUE), gsub("[0-9]{5}","",a_df$state), a_df$state) -# # a_df$postal_code <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$postal_code, ignore.case = TRUE), gsub("[a-z]{2}","",a_df$postal_code), a_df$postal_code) -# # a_df$country <- ifelse(grepl("[A-Za-z]{2} [0-9]{5}", a_df$country, ignore.case = TRUE), "usa", a_df$country) -# # -# # a_df$state <-trimws(a_df$state,which = "both") -# # a_df$postal_code<- trimws(a_df$postal_code,which = "both") -# # a_df$country <- trimws(a_df$country,which = "both") -# -# -# -# # Postal Codes letters-numbers -------------------------------------------- -# -# # Define the function -# extract_postcodes <- function(df, source_col, target_col,sequence) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- sequence -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# } -# } -# return(df) -# } -# -# # usage -# -# sequence<-"\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" -# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) -# sequence<-"\\bbr[-][0-9]{2} [0-9]{5}\\b" -# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) -# sequence<-"\\b[0-9]{5}-[0-9]{3}\\b" -# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) -# -# -# # postal codes - numbers after -------------------------------------------- -# sequence<-"\\b [0-9]{3,8}\\b" -# a_df <- extract_postcodes(a_df, "city","postal_code",sequence) -# -# -# -# # postal codes - uk+ ------------------------------------------------------ -# -# # Define the function -# extract_uk_postcodes <- function(df, source_col, target_col,city_col,sequence) { -# # Regular expression pattern for UK postal codes -# # One or two initial letters. -# # One or two digits (and possibly a letter). -# # A mandatory space. -# # One digit followed by two letters. -# pattern <- sequence -# -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# df[i, city_col] <- df[i, source_col] -# -# } -# } -# return(df) -# } -# -# # Example usage -# sequence <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}" -# # extract_uk_postcodes <- function(df, source_col, target_col,city_col,sequence) -# -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# sequence <- "[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [A-Za-z]{2}" -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# sequence <- "[A-Za-z]{1,2}[0-9R]{3} [A-Za-z]{3}" -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# sequence <- "[A-Za-z]{2}[0-9R]{3}[A-Za-z]{2}" -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# sequence <- "[A-Za-z]{1}[0-9]{3} [0-9]{1}[A-Za-z]{2}" -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# sequence <- "[0-9]{4} [0-9]{2}[A-Za-z]{1}" -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# sequence <- "[A-Za-z]{2} [A-Za-z]{1} [0-9]{3}[A-Za-z]{2}" -# a_df <- extract_uk_postcodes(a_df, "city","postal_code", "city",sequence) -# a_df <- extract_uk_postcodes(a_df, "department","postal_code", "city",sequence) -# -# -# # postal codes - canada --------------------------------------------------- -# a_df$postal_code <- ifelse(a_df$country=="canada", NA,a_df$postal_code) -# -# a_df$state <- ifelse(a_df$country=="canada" & -# grepl("[A-Za-z]{2}", a_df$city, ignore.case = TRUE), -# a_df$city, a_df$state) -# -# a_df$city <- ifelse(a_df$country=="canada" & -# a_df$city==a_df$state, -# NA, a_df$city) -# -# -# -# -# # Define the function -# extract_postcodes <- function(df, source_col, target_col,sequence) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- sequence -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# } -# } -# return(df) -# } -# -# # usage -# -# sequence<-"\\b[A-Za-z]{1}[0-9]{1}[A-Za-z]{1} [0-9]{1}[A-Za-z]{1}[0-9]{1}\\b" -# a_df <- extract_postcodes(a_df, "state","postal_code",sequence) -# -# sequence<-"\\b[A-Za-z]{1}[0-9]{1}[A-Za-z]{1} [0-9]{1}[A-Za-z]{1}[0-9]{1}\\b" -# a_df <- extract_postcodes(a_df, "department","postal_code",sequence) -# -# a_df$state<-ifelse(a_df$country=="canada", sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", a_df$state),a_df$state) # all after first space -# -# -# a_df$city<-ifelse(a_df$country=="canada" & is.na(a_df$city),a_df$department,a_df$city) # all after first space -# -# a_df$city<-ifelse(a_df$country=="canada", sub("\\s([a-z0-9]+\\s[a-z0-9]+)$", "", a_df$city),a_df$city) # all after first space -# -# a_df$state<-ifelse(a_df$country=="canada", gsub("[0-9]", "", a_df$state),a_df$state) -# -# -# -# # postal codes - india ---------------------------------------------------- -# -# a_df$state<-ifelse(a_df$country=="india", a_df$city,a_df$state) -# -# # Define the function -# extract_postcodes <- function(df, source_col, target_col,city_col,sequence) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- sequence -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# df[i, city_col] <- df[i, source_col] -# } -# } -# return(df) -# } -# -# # usage -# -# sequence<-"[0-9]{5,8}" -# a_df <- extract_postcodes(a_df, "department","postal_code","city",sequence) -# -# -# a_df$department<-ifelse(a_df$country=="india" & a_df$city==a_df$department,NA,a_df$department) -# a_df$city<-ifelse(a_df$country=="india" & a_df$city==a_df$state,NA,a_df$city) -# a_df$city<-ifelse(a_df$country=="india" & is.na(a_df$city),a_df$department,a_df$city) -# -# -# # Define the function -# extract_postcodes2 <- function(df, source_col, target_col,city_col,sequence) { -# # One or two initial letters. -# # mandatory dash -# # several numbers -# pattern <- sequence -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0) { -# df[i, city_col] <- df[i, source_col] -# } -# } -# return(df) -# } -# -# # usage -# -# sequence<-"[0-9]{5,8}" -# a_df <- extract_postcodes2(a_df, "postal_code","city","city",sequence) -# -# a_df$city <- ifelse(a_df$country=="india" & (grepl("delhi", a_df$city)|grepl("delhi", a_df$state)), -# "new delhi", a_df$city) -# -# -# -# -# -# # postal codes - australia ------------------------------------------------- -# -# a_df$state<-ifelse(a_df$country=="australia", a_df$city,a_df$state) -# a_df$city<-ifelse(a_df$country=="australia", a_df$department,a_df$city) -# -# a_df$state<-ifelse(a_df$country=="australia", gsub("\\s[0-9]{4,5}", "", a_df$state),a_df$state) -# a_df$city<-ifelse(a_df$country=="australia", gsub("\\s[0-9]{4,5}", "", a_df$city),a_df$city) -# -# -# # Brazil ------------------------------------------------------------------ -# -# -# -# -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# to_check <- c("dept","ctr","inst","ppg","andar","empresas", -# "programa", "ciencias", "unidade", "lab ") -# -# a_df$department <- ifelse(a_df$country=="brazil" & grepl(paste(to_check, collapse = "|"), a_df$department), -# NA, -# a_df$department) -# -# -# -# -# -# -# # Define the function -# extract_brl_postcodes <- function(df, source_col, target_col) { -# # 6 digits -# -# pattern <- "br-[0-9]{5,8}" -# -# # Loop through each row of the dataframe -# for(i in 1:nrow(df)) { -# # Find all matches of the pattern in the source column -# matches <- gregexpr(pattern, df[i, source_col]) -# # matches <- gregexpr(paste(pattern1,collapse = "|"), df[i, source_col]) -# # Extract the matches -# extracted_codes <- regmatches(df[i, source_col], matches) -# # If there's at least one match and the target column is NA, copy the first match to the target column -# if(length(extracted_codes[[1]]) > 0 && is.na(df[i, target_col])) { -# df[i, target_col] <- extracted_codes[[1]][1] -# # df[i, brl_new_city] <- df[i, source_col] -# # df[i, city_col] <- df[i, source_col] -# -# } -# } -# return(df) -# } -# -# -# # df, source_col, target_col,city_col -# a_df <- extract_brl_postcodes(a_df, "department","postal_code") -# -# -# -# -# -# -# -# -# -# -# -# -# -# results <- extract_brl_postcodes(results, "brl_state","brl_pc") -# -# -# results$brl_new_city <- ifelse(is.na(results$brl_new_city), -# results$brl_state, -# results$brl_new_city) -# -# -# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, -# results$brl_city, -# results$brl_state) -# -# -# -# -# results$brl_new_city <- ifelse(is.na(results$brl_new_city), -# results$brl_city, -# results$brl_new_city) -# -# -# -# -# results$brl_city<-gsub("br-","",results$brl_city) -# results$brl_city<-sub("[0-9]{2,8}","",results$brl_city) -# results$brl_city<-sub("-","",results$brl_city) -# -# -# results$brl_new_city<-gsub("br-","",results$brl_new_city) -# results$brl_new_city<-sub("[0-9]{2,8}","",results$brl_new_city) -# results$brl_new_city<-sub("-","",results$brl_new_city) -# -# -# results$brl_pc<-gsub("br-","",results$brl_pc) -# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) -# results$brl_pc<-sub("[0-9]{2,8}","",results$brl_pc) -# -# results$brl_state<-gsub("br-","",results$brl_state) -# results$brl_state<-sub("[0-9]{2,8} ","",results$brl_state) -# results$brl_state<-sub(" [0-9]{2,8} ","",results$brl_state) -# -# # -# # -# # -# # -# # -# # results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) -# # results$brl_new_city<-gsub("gavea rio de janeiro","rio de janeiro",results$brl_new_city) -# # results$brl_new_city<-gsub("gavea","rio de janeiro",results$brl_new_city) -# # # -# # results$brl_city<-gsub("-","",results$brl_city) -# # -# # results$brl_state<-gsub("br-","",results$brl_state) -# # results$brl_state<-gsub("-","",results$brl_state) -# -# -# -# -# -# # any without numbers gets NA'd -# results$brl_pc[!grepl("\\d", results$brl_pc)] <- NA -# -# # keep portions with numbers / remove city names -# -# results$brl_city<-sub("[0-9]{2,8} ","",results$brl_city) -# results$brl_city<-sub(" [0-9]{2,8}","",results$brl_city) -# # -# # # Specific replacements -# # results$brl_city <- ifelse(grepl("rio de janeiro", results$brl_city), -# # "rio de janeiro", results$brl_city) -# -# results$brl_city <- ifelse(grepl("museu nacl", results$brl_city), -# "rio de janeiro", results$brl_city) -# results$brl_state <- ifelse(grepl("rio de janeiro", results$brl_state, ignore.case = TRUE), -# "rj", results$brl_state) -# results$brl_state <- ifelse(grepl("sao paulo", results$brl_state, ignore.case = TRUE), -# "sp", results$brl_state) -# -# -# -# results$brl_city[results$brl_city==results$brl_state]<-NA -# -# -# # Clean up and adjust columns -# results[] <- lapply(results, trimws) -# -# -# # Define city-to-state mapping -# city_state_mapping <- data.frame( -# city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), -# state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), -# stringsAsFactors = FALSE -# ) -# -# # Match cities and states -# for (i in 1:nrow(city_state_mapping)) { -# results$brl_city <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), -# city_state_mapping$city[i], results$brl_city) -# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), -# city_state_mapping$state[i], results$brl_state) -# } -# -# -# # Match cities and states -# for (i in 1:nrow(city_state_mapping)) { -# -# results$brl_state <- ifelse(grepl(city_state_mapping$city[i], results$brl_state, ignore.case = TRUE), -# city_state_mapping$state[i], results$brl_state) -# } -# -# -# -# results$brl_state <- ifelse(results$brl_new_city==results$brl_state, -# results$brl_city, -# results$brl_state) -# -# -# results$brl_city <- trimws(results$brl_city, which = "both") -# results$brl_state <- trimws(results$brl_state, which = "both") -# # Define words indicating this is actually a dept not state or postal code -# # will use this list to delete the ones that don't apply -# to_check <- c("dept","ctr","inst","ppg", -# "programa", "ciencias", "unidade", "lab ") -# -# results$brl_city <- ifelse(grepl(paste(to_check, collapse = "|"), results$brl_city), -# results$brl_state, -# results$brl_city) -# -# -# # Final trimming -# results[] <- lapply(results, trimws) -# -# results$brl_city <- ifelse(results$brl_new_city==results$brl_city, -# NA, -# results$brl_city) -# From 9f6c3f18ce8b6e0fe85319d5e034b30259c23681 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 10 Mar 2025 11:08:52 -0400 Subject: [PATCH 15/34] added info on new geocoding to readme and news --- NEWS.md | 10 ++++++++++ R/authors_georef.R | 2 +- README.Rmd | 25 +------------------------ 3 files changed, 12 insertions(+), 25 deletions(-) diff --git a/NEWS.md b/NEWS.md index ea7a4d7..2e748af 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,16 @@ # refsplitr News +refsplitr 1.5 (2025-04-15) +========================= + +### NEW FEATURES + + * The new default service for georeferencing author institutions is [`tidygeocoder`](https://jessecambon.github.io/tidygeocoder/). Using [`ggmap`](https://github.com/dkahle/ggmap) to access the Google Maps API is still an option, but users should be aware that this is no longer a free service. + + * The `authors_addresses` function has been updated. + + refsplitr 1.0.2 (2024-08-12) ========================= diff --git a/R/authors_georef.R b/R/authors_georef.R index 2c60f19..666f4bf 100644 --- a/R/authors_georef.R +++ b/R/authors_georef.R @@ -22,7 +22,7 @@ #' #' @param data dataframe from `authors_refine()` #' @param address_column name of column in quotes where the addresses are -#' #' @param google_api if FALSE georeferencing is carried out with the +#' @param google_api if FALSE georeferencing is carried out with the #' tidygeocoder package (geocode() with method = 'osm'). if TRUE geocoding #' is done with the google maps API. Defaults to FALSE. #' @importFrom ggmap geocode diff --git a/README.Rmd b/README.Rmd index 02a9c4b..0191441 100644 --- a/README.Rmd +++ b/README.Rmd @@ -35,7 +35,7 @@ There are four steps in the `refsplitr` package's workflow: 1. Importing and tidying Web of Science reference records (be sure to download records using the procedure in Appendix 1 of the [vignette](https://docs.ropensci.org/refsplitr/articles/refsplitr.html)) 2. Author name disambiguation and parsing of author addresses -3. Georeferencing of author institutions. (*Important Note*: Google has changed its API requirements, which means users now have register with Google prior to georeferencing. For additional details see the [`ggmap`](https://github.com/dkahle/ggmap) repository and the instructions below.) +3. Georeferencing of author institutions using either [`tidygeocoder`](https://jessecambon.github.io/tidygeocoder/) (default; free) or the Google Maps API (paid; for additional details on how to register with Google prior to georeferencing see the [`ggmap`](https://github.com/dkahle/ggmap) repository and `refsplitr` [vignette](https://docs.ropensci.org/refsplitr/articles/refsplitr.html).) 4. Data visualization The procedures required for these four steps,each of which is implemented with a simple command, are described in detail in the `refsplitr` [vignette](https://docs.ropensci.org/refsplitr/articles/refsplitr.html). An example of this workflow is provided below: @@ -58,29 +58,6 @@ dat4 <- authors_georef(dat3) plot_net_address(dat4$addresses) ``` -## Registering with Google for an API key - -1. Install and load the `ggmap` package - -```{r example2, eval=FALSE} - -install.packages("ggmap") -library(ggmap) - -``` - -1. Register for a Google [Geocoding API](https://developers.google.com/maps/documentation/geocoding/overview) by following the instructions on the `READ ME` of the [`ggmap`](https://github.com/dkahle/ggmap) repository. - -2. Once you have your API key, add it to your `~/.Renviron` with the following: - -```{r example3, eval=FALSE} -`ggmap::register_google(key = "[your key]", write = TRUE)` -``` - -3. You should now be able to use `authors_georef()` as described in the vignette. **WARNING:** `refsplitr` currently has a limit of 2500 API calls per day. We are working on including the ability for users to select their own limits. - -***Remember***: Your API key is unique and for you alone. Don't share it with other users or record it in a script file that is saved in a public repository. If need be you can visit the same website where you initially registered and generate a new key. - ## Improvements & Suggestions We welcome any suggestions for package improvement or ideas for features to include in future versions. We welcome any suggestions for package improvement or ideas for features to include in future versions. If you have suggestions, [here is how to contribute](https://github.com/ropensci/refsplitr/blob/master/CONTRIBUTING.md). We expect everyone contributing to the package to abide by our [Code of Conduct](https://github.com/ropensci/refsplitr/blob/master/CODE_OF_CONDUCT.md). From 66258b98ee622ed89a3ab3c061e9e2c76ac813ab Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 10 Mar 2025 11:17:30 -0400 Subject: [PATCH 16/34] replace old authors_address with new code --- R/authors_address.R | 1304 ++++++++++++++++++++++++++++++------ R/authors_address_update.R | 1133 ------------------------------- 2 files changed, 1089 insertions(+), 1348 deletions(-) delete mode 100644 R/authors_address_update.R diff --git a/R/authors_address.R b/R/authors_address.R index 7c22e5b..bbc68e0 100644 --- a/R/authors_address.R +++ b/R/authors_address.R @@ -1,259 +1,1133 @@ #' Parses out address information and splits it into its respective parts. #' This is an internal function used by \code{authors_clean} -#' -#' \code{authors_address} This function takes the output from +#' +#' \code{authors_address} This function takes the output from #' \code{references_read} and pulls out address information. Splitting it into -#' university, department, city, state, etc. +#' university, department, city, state, etc. #' @param addresses the addresses #' @param ID the authorID #' @noRd -authors_address <- function(addresses, ID){ - message("\nSplitting addresses\n") +authors_address <- function(addresses, ID) { + + addresses <- tolower(addresses) - addresses<-tolower(addresses) # make all lower case + message("\nSplitting addresses\n") list_address <- strsplit(addresses, ",") - - + + + # remove punctuation ---------------------------------------- + ## First remove periods and trim white space from countries. ## helps avoids mistakes later on - + remove_period_from_last <- function(list_address) { lapply(list_address, function(x) { if (length(x) > 0) { x[length(x)] <- gsub("\\.$", "", x[length(x)]) - x[length(x)] <- trimws(x[length(x)], which = "both") + x[length(x)] <- trimws(x[length(x)], which = "both") } return(x) }) } - + list_address <- remove_period_from_last(list_address) - - - university_list <- vapply(list_address, function(x) x[1], character(1)) - country_list <- vapply(list_address, function(x) { - gsub("\\_", "", x[length(x)]) }, - character(1)) - country_list <- trimws(country_list, which = "both") - pc_list <- trimws(substr(country_list, 1, (vapply(regexpr("usa", - country_list), function(x) x[1], numeric(1))) - 1), which = "right") - state_list <- pc_list - state_list[nchar(state_list) > 0] <- regmatches( - state_list[nchar(state_list) > 0], - regexpr("[[:lower:]]{2}", state_list[nchar(state_list) > 0]) + # trim ws ----------------------------------------------------------------- + + list_address <- lapply(list_address, trimws) + + # correct countries ------------------------------------------------------- + + + # correct or update names of some countries to make it possible to georef + + # Define the function + correct_countries <- function(my_list, replacements) { + # Loop through each element of the list + for (i in 1:length(my_list)) { + # Get the length of the current element + len <- length(my_list[[i]]) + + # Check if the last item matches any of the target words + if (len > 0 && my_list[[i]][len] %in% names(replacements)) { + # Replace the last item with the corresponding replacement word + my_list[[i]][len] <- replacements[[my_list[[i]][len]]] + } + } + return(my_list) + } + # NB: also updated country names. + # czechia = new name for czech republic + # united arab rep = current country name depends on city + replacements <- c( + "austl" = "australia", + "c z" = "czechia", + "cz" = "czechia", + "czech republic" = "czechia", + "fed rep ger" = "germany", + "columbia" = "colombia", + "peoples r china" = "china", + "u arab emirates" = "united arab emirates", + "mongol peo rep" = "mongolia", + "dominican rep" = "dominican republic", + "fr polynesia" = "french polynesia", + "neth antilles" = "netherland antilles", + "trinid & tobago" = "trinidad & tobago", + "rep congo" = "congo", + "north ireland" = "northern ireland", + "syrian arab rep" = "syria" ) - pc_list[nchar(pc_list) > 2] <- regmatches(pc_list[nchar(pc_list) > 2], - regexpr("[[:digit:]]{5}", pc_list[nchar(pc_list) > 2])) - pc_list[nchar(pc_list) < 3] <- "" - country_list <- ifelse(grepl("usa", country_list), "usa", country_list) + list_address <- correct_countries(list_address, replacements) + + # extract university ------------------------------------------------------ + + university_list <- vapply(list_address, function(x) x[1], character(1)) + + # extract department ------------------------------------------------------ + + # If department is listed it is typically second + # (EB note: only if 4+ slots) + # this will be 2x checked later - list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))]) + dept_extract <- function(x) { + if (length(x) < 4) { + return(NA) + } else { + return(trimws(x[[2]])) + } + } + + dept_list <- unlist(lapply(list_address, dept_extract)) + + dept_list <- trimws(dept_list, which = "both") + + + # Extract City ------------------------------------------------------------ + + # If there is only one element, then it can't have both city and country' + city_list <- vapply(list_address, function(x) { + n <- length(x) + if (n == 1) { + return("no city") # placeholder to replace with NA after function + } + + # In some cases city is next-to-last element, in others next-to-next-to-last + last_element <- x[[n]] + second_last <- if (n > 1) x[[n - 1]] else NA + third_last <- if (n > 2) x[[n - 2]] else NA + + # Default case + return(second_last) + }, character(1)) + + # Cleanup + city_list <- trimws(city_list, which = "both") + city_list[city_list == "no city"] <- NA + + + # extract state ----------------------------------------------------------- + + # If there is only one element, then it can't have both city and country' + state_list <- vapply(list_address, function(x) { + n <- length(x) + if (n == 1) { + return("no state") # placeholder to replace with NA after function + } - # Because formats of address printing is different across platforms - # We are going to split using a tier system assuming first and last - # info is somewhat reliable and guess the other info from the - # remaining position of the info + # In some cases city is next-to-last element, in others next-to-next-to-last + last_element <- x[[n]] + second_last <- if (n > 1) x[[n - 1]] else NA + third_last <- if (n > 2) x[[n - 2]] else NA - second_tier_list <- lapply(list_address1, function(x) x[length(x)]) - second_tier_list <- trimws(second_tier_list, which = "both") - second_tier_list[second_tier_list == "character(0)"] <- NA + # Default case + return(third_last) + }, character(1)) - list_address2 <- lapply(list_address1, function(x) x[-c(length(x))]) + # Cleanup + state_list <- trimws(state_list, which = "both") + state_list[state_list == "no state"] <- NA - third_tier_list <- lapply(list_address2, function(x) x[length(x)]) - third_tier_list <- trimws(third_tier_list, which = "both") - third_tier_list[third_tier_list == "character(0)"] <- NA + # this is used to double check later - sometimes city is extracted as state + city_list2 <- trimws(state_list, which = "both") - # All remaining info is just shoved in this category - remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1]) - remain_list <- trimws(remain_list, which = "both") - remain_list[remain_list == "character(0)"] <- NA + # Extract Country --------------------------------------------------------- + + country_list <- vapply( + list_address, function(x) { + gsub("\\_", "", x[length(x)]) + }, + character(1) + ) + + + # postal code (pc) list --------------------------------------------------- + + # pc often with city + + pc_list <- city_list + + # bind all into df -------------------------------------------------------- a_df <- data.frame( - adID = ID, university = university_list, + adID = ID, + university = university_list, country = country_list, - state = state_list, postal_code = pc_list, city = NA, - department = NA, second_tier = second_tier_list, - third_tier = third_tier_list, - remain = remain_list, address = addresses, + state = state_list, + postal_code = pc_list, + city = city_list, + city2 = city_list2, + department = dept_list, + address = addresses, stringsAsFactors = FALSE ) - # try to fix the USA spots, which vary in format than other countries - a_df$city[nchar(a_df$state) > 0] <- a_df$second_tier[nchar(a_df$state) > 0] - a_df$state[nchar(a_df$state) == 0] <- NA - a_df$postal_code[nchar(a_df$postal_code) == 0] <- NA - a_df$department[!is.na(a_df$state) & !is.na(a_df$postal_code) & - !is.na(a_df$state)] <- a_df$third_tier[!is.na(a_df$state) & - !is.na(a_df$postal_code) & !is.na(a_df$state)] - # fix a US problem when USA is not tacked onto the end - - us_reg <- "[[:alpha:]]{2}[[:space:]]{1}[[:digit:]]{5}" - a_df$state[ grepl(us_reg, a_df$country) ] <- - substr(a_df$country[ grepl(us_reg, a_df$country) ], 1, 2) - - a_df$postal_code[ grepl(us_reg, a_df$country) ] <- - substr(a_df$country[grepl(us_reg, a_df$country)], 4, 8) - - a_df$country[grepl(us_reg, a_df$country)] <- "usa" - - ########################## - # We'll use regular expression to pull zipcodes - # These formats differ by region - int1 <- "[[:alpha:]]{2}[[:punct:]]{1}[[:digit:]]{1,8}" - int2 <- paste("[[:space:]][[:upper:]][[:digit:]][[:lower:]]", - "[[:space:]][[:digit:]][[:lower:]][[:digit:]]", sep="") - int3 <- "[[:alpha:]][[:punct:]][[:digit:]]{4,7}" - int4 <- "[:lower:]{1,2}[:alnum:]{1,3}[:space:][:digit:][:alnum:]{1,3}" - int <- paste(int1, int2, int3, int4, sep = "|") - - UK <- paste("[[:lower:]]{1,2}[[:digit:]]{1,2}[[:space:]]", - "{1}[[:digit:]]{1}[[:lower:]]{2}", sep="") - - Mexico <- "[[:space:]]{1}[[:digit:]]{5}" # technically US as well - - Panama <- "[[:digit:]]{4}-[[:digit:]]{5}" - - zip_search <- paste0(int, "|", UK, "|", Mexico, "|", Panama) - - ########################### - id_run <- a_df$adID[is.na(a_df$state) & is.na(a_df$postal_code) & - a_df$address != "Could not be extracted"] - ########################### - - # We now iteratively run through the addresses using the concept that - # certain information always exists next to each other. - # Ex. city, state, country tend to exist next to each other. - # We use the position of the zipcode also to help guide us - # in where the information lies as well as how many fields were - # given to us. - for (i in id_run) { - found <- FALSE - row <- which(a_df$adID == i) - university <- a_df$university[row] - second_tier <- a_df$second_tier[row] - third_tier <- a_df$third_tier[row] - remain <- a_df$remain[row] - city <- NA - state <- NA - postal_code <- NA - department <- NA - grepl(zip_search, second_tier) - grepl(zip_search, third_tier) - # 2nd tier - if (grepl(zip_search, second_tier)) { - found <- TRUE - postal_code <- regmatches(second_tier, regexpr(zip_search, second_tier)) - city <- gsub(zip_search, "", second_tier) - department <- ifelse(is.na(remain), third_tier, remain) - } - # 3RD tiers - if (grepl(zip_search, third_tier) & !found) { - found <- TRUE - postal_code <- regmatches(third_tier, regexpr(zip_search, third_tier)) - city <- gsub(zip_search, "", third_tier) - state <- second_tier - department <- remain - } - if (!found) { - state <- second_tier - city <- third_tier - department <- remain - } - # To make university searching more efficient we'll override values - # based on if it has university/college in the name, - # where university overides college - override_univ <- grepl("\\buniv\\b|\\buniversi", - c(second_tier, third_tier, remain, city, university), - ignore.case = TRUE) & - !grepl("\\bdrv\\b|\\bdrive\\b", - c(second_tier, third_tier, remain, city, university), - ignore.case = TRUE) - - if (any(override_univ)) { - university <- - c(second_tier, third_tier, remain, city, university)[override_univ][1] - assign( - c("second_tier", "third_tier", "remain", "city", "university")[ - override_univ][1], - NA - ) - } - # only if university doesnt already exist - override_univ_col <- - grepl("\\bcol\\b|college|\\bcoll\\b", - c(second_tier, third_tier, remain, city, university), - ignore.case = TRUE) & - !grepl("\\bdrv\\b|\\bdrive\\b", - c(second_tier, third_tier, remain, city, university), - ignore.case = TRUE) - - if (!any(override_univ) & any(override_univ_col)) { - university <- - c(second_tier, third_tier, remain, city, university )[ - override_univ_col][1] - - assign( - c("second_tier", "third_tier", "remain", "city", "university")[ - override_univ_col][1], - NA - ) - } - # more risky, but institutions as well, just incase its not a university - override_univ_inst <- grepl("\\binst\\b|\\binstitut", - c(second_tier, third_tier, remain, city, university), - ignore.case = TRUE) - if ( - !any(override_univ) & !any(override_univ_col) & any(override_univ_inst) - ) { - department <- c(second_tier, third_tier, remain, city, university )[ - override_univ_inst][1] - - assign( - c("second_tier", "third_tier", "remain", "city", "university")[ - override_univ_inst][1], - NA - ) - } - a_df$city[row] <- gsub("[[:digit:]]", "", city) - a_df$state[row] <- gsub("[[:digit:]]", "", state) - a_df$postal_code[row] <- postal_code - a_df$department[row] <- department + # any PC without numbers gets NA'd + a_df$postal_code[!grepl("\\d", a_df$postal_code)] <- NA + + # copy over PC and state + a_df$state <- ifelse(grepl("usa", a_df$country) & nchar(a_df$state) > 2, + NA, + a_df$state + ) + + + a_df$postal_code <- ifelse(grepl("[a-z]{2} [0-9]{5} usa", a_df$country), + a_df$country, a_df$postal_code + ) + + a_df$state <- ifelse(grepl("[a-z]{2} [0-9]{5} usa", a_df$country), + a_df$country, a_df$state + ) + + a_df$state <- ifelse(grepl("[a-z]{2} [0-9]{5}", a_df$city), + a_df$city, a_df$state + ) + + + a_df$state <- ifelse(grepl("[a-z]{2} usa", a_df$country), + a_df$country, a_df$state + ) + + # remove the numbers and letters as appropriate + + + a_df$country <- ifelse(grepl(" usa", a_df$country), + "usa", a_df$country + ) + + a_df$state <- ifelse(a_df$country == "usa" & grepl("[a-z]{2} [0-9]{5}", a_df$state), + gsub("[[:digit:]]{5}", "", a_df$state), a_df$state + ) + + a_df$state <- ifelse(a_df$country == "usa" & grepl(" usa", a_df$state), + gsub(" usa", "", a_df$state), a_df$state + ) + + + a_df$postal_code <- ifelse(a_df$country == "usa", + gsub( + "[[:alpha:]]{2} ", "", + a_df$postal_code + ), a_df$postal_code + ) + + a_df$postal_code <- ifelse(a_df$country == "usa", + gsub( + " usa", "", + a_df$postal_code + ), a_df$postal_code + ) + + + + + a_df$city <- ifelse(a_df$country == "usa" & grepl("[a-z]{2} [0-9]{5}", a_df$city), + a_df$city2, a_df$city + ) + + + pattern <- "[a-z]{2} [0-9]{5}" + + a_df$postal_code <- ifelse(grepl(pattern, a_df$country), + a_df$country, a_df$postal_code + ) + a_df$state <- ifelse(grepl(pattern, a_df$country), + a_df$country, a_df$state + ) + a_df$country <- ifelse(grepl(pattern, a_df$country), + "usa", a_df$country + ) + a_df$postal_code <- ifelse(a_df$country == "usa" & grepl(pattern, a_df$postal_code), + gsub("[a-z]", "", a_df$postal_code), a_df$postal_code + ) + a_df$state <- ifelse(a_df$country == "usa" & grepl(pattern, a_df$state), + gsub("[0-9]", "", a_df$postal_code), a_df$state + ) + + + + # BRAZIL clean-up --------------------------------------------------------- + + a_df$state <- ifelse(a_df$country == "brazil" & nchar(a_df$city) == 2, + a_df$city, a_df$state + ) + a_df$city <- ifelse(a_df$country == "brazil" & nchar(a_df$city) == 2, + a_df$city2, a_df$city + ) + a_df$city2 <- ifelse(a_df$country == "brazil" & a_df$city == a_df$city2, + NA, a_df$city2 + ) + a_df$postal_code <- ifelse(a_df$country == "brazil" & is.na(a_df$postal_code), + a_df$city, a_df$postal_code + ) + a_df$state <- ifelse(a_df$country == "brazil" & nchar(a_df$state) > 2, + NA, a_df$state + ) + + a_df$postal_code <- ifelse(a_df$country == "brazil", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country == "brazil", + gsub("[-]", "", a_df$postal_code), + a_df$postal_code + ) + + a_df$city <- ifelse(a_df$country == "brazil", + gsub("br-", "", a_df$city), + a_df$city + ) + a_df$city <- ifelse(a_df$country == "brazil", + gsub("[0-9]", "", a_df$city), + a_df$city + ) + + + a_df$state <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2), + a_df$city, a_df$state + ) + a_df$postal_code <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2), + a_df$city2, a_df$postal_code + ) + a_df$city <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2), + a_df$city2, a_df$city + ) + + + # repeat the clean of city + a_df$city <- ifelse(a_df$country == "brazil", + gsub("br-", "", a_df$city), + a_df$city + ) + a_df$city <- ifelse(a_df$country == "brazil", + gsub("[0-9]", "", a_df$city), + a_df$city + ) + + + a_df$postal_code <- ifelse(a_df$country == "brazil", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country == "brazil", + gsub("[-]", "", a_df$postal_code), + a_df$postal_code + ) + + a_df[] <- lapply(a_df, trimws) - #########################Clock############################### - total <- length(id_run) - pb <- utils::txtProgressBar(min = 0, max = total, style = 3) - utils::setTxtProgressBar(pb, which(id_run == i)) - ############################################################# + + # Define city-to-state mapping + city_state_mapping <- data.frame( + city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), + state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), + stringsAsFactors = FALSE + ) + + # Match cities and states + for (i in 1:nrow(city_state_mapping)) { + a_df$city <- ifelse(a_df$country == "brazil" & grepl(city_state_mapping$city[i], a_df$state, ignore.case = TRUE), + city_state_mapping$city[i], a_df$city + ) + a_df$state <- ifelse(a_df$country == "brazil" & grepl(city_state_mapping$city[i], a_df$state, ignore.case = TRUE), + city_state_mapping$state[i], a_df$state + ) } - city_fix <- is.na(a_df$city) & !is.na(a_df$state) - a_df$city[city_fix] <- a_df$state[city_fix] - a_df$state[city_fix] <- NA - a_df$university[a_df$university == "Could not be extracted"] <- NA - a_df$country[a_df$country == "Could not be extracted"] <- NA - a_df$country[a_df$country == "Peoples R China"] <- "China" - a_df$postal_code[grepl("[[:alpha:]]{1,2}-", a_df$postal_code)] <- - vapply(strsplit( - a_df$postal_code[ grepl("[[:alpha:]]{1,2}-", a_df$postal_code)], - "-"), - function(x) x[2], character(1) + # Match cities and states + for (i in 1:nrow(city_state_mapping)) { + a_df$state <- ifelse(a_df$country == "brazil" & grepl(city_state_mapping$city[i], a_df$city, ignore.case = TRUE), + city_state_mapping$state[i], a_df$state ) - #strip periods from the ends of city,state,country - a_df$city <- gsub("\\.", "", a_df$city) - a_df$state <- gsub("\\.", "", a_df$state) - a_df$country <- gsub("\\.", "", a_df$country) - a_df$country[a_df$country == ""] <- NA - a_df$university[a_df$university == ""] <- NA - a_df$postal_code[a_df$postal_code == ""] <- NA - #convert to lower - for (l in 2:ncol(a_df)){ - a_df[, l] <- tolower(a_df[, l]) } + + # AUSTRALIA clean-up--------------------------------------------------------------- + + a_df$state <- ifelse(a_df$country == "australia", + a_df$city, a_df$state + ) + a_df$postal_code <- ifelse(a_df$country == "australia", + a_df$city, a_df$postal_code + ) + a_df$city <- ifelse(a_df$country == "australia", + a_df$city2, a_df$city + ) + a_df$city2 <- ifelse(a_df$country == "australia", + NA, a_df$city2 + ) + + + a_df$postal_code <- ifelse(a_df$country == "australia", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code + ) + a_df$state <- ifelse(a_df$country == "australia", + gsub("[0-9]", "", a_df$state), + a_df$state + ) + + a_df[] <- lapply(a_df, trimws) + + + + # CANADA clean-up --------------------------------------------------------- + + a_df$state <- ifelse(a_df$country == "canada" & nchar(a_df$city) == 2, + a_df$city, a_df$state + ) + + a_df$city <- ifelse(a_df$country == "canada" & nchar(a_df$city) == 2, + NA, a_df$city + ) + + a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2), + a_df$city2, a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2), + a_df$city2, a_df$postal_code + ) + + a_df$state <- ifelse(a_df$country == "canada" & a_df$city2 == a_df$state, + NA, a_df$state + ) + + a_df$city <- ifelse(a_df$country == "canada", a_df$city2, a_df$city) + a_df$city2 <- ifelse(a_df$country == "canada", NA, a_df$city2) + + + a_df$city <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city), + gsub("\\b(\\w{3})\\b \\b(\\w{3})\\b", "", a_df$city), + a_df$city + ) + + a_df$state <- ifelse(a_df$country == "canada" & is.na(a_df$state), + a_df$postal_code, + a_df$state + ) + + a_df$state <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$state), + gsub("\\b(\\w{3})\\b \\b(\\w{3})\\b", "", a_df$state), + a_df$state + ) + + a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{2,20})\\b \\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$postal_code), + gsub("\\b(\\w{1,2}|\\w{4,})\\b", "", a_df$postal_code), + a_df$postal_code + ) + + a_df[] <- lapply(a_df, trimws) + + # TODO: a few postal codes still have letters from city + + + a_df$postal_code <- ifelse(a_df$country == "canada", gsub(" ", "", a_df$postal_code), a_df$postal_code) + + + # UK clean-up ------------------------------------------------------------- + + uk <- c("scotland", "england", "wales", "northern ireland") + pattern <- "[a-z0-9]{2,4} [a-z0-9]{3,4}" + # + # a_df$postal_code <- ifelse(a_df$country %in% uk & + # grepl(pattern, a_df$city2),a_df$city2, + # a_df$postal_code) + + a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city2), + a_df$city2, a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$state), + a_df$state, a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city), + a_df$city, a_df$postal_code + ) + + a_df$postal_code <- ifelse(a_df$country %in% uk, + ifelse(!grepl("\\d", a_df$postal_code), NA, a_df$postal_code), + a_df$postal_code + ) + + a_df$city <- ifelse(a_df$country %in% uk & a_df$city == a_df$postal_code, + NA, a_df$city + ) + + a_df$state <- ifelse(a_df$country %in% uk & a_df$state == a_df$postal_code, + NA, a_df$state + ) + + + a_df$state <- ifelse(a_df$country == "england", a_df$city, a_df$state) + a_df$city <- ifelse(a_df$country == "england", NA, a_df$city) + a_df$city <- ifelse(a_df$country == "england", a_df$postal_code, a_df$city) + a_df$city <- ifelse(a_df$country == "england", + gsub("\\b\\w*\\d\\w*\\b", "", a_df$city), + a_df$city + ) + + # TODO: england still needs work + + a_df$state <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales", NA, a_df$state) + a_df$state <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales" & is.na(a_df$state), a_df$city, a_df$state) + a_df$city <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales", a_df$postal_code, a_df$city) + a_df$city <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales" & is.na(a_df$city), a_df$city2, a_df$city) + a_df$city <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales", + gsub("\\b\\w*\\d\\w*\\b", "", a_df$city), + a_df$city + ) + + + # postal codes clean uk --------------------------------------------------- + + + # Define the function + keep_numerical_parts <- function(df, control_col, country, target_col) { + # Apply the function to each row using sapply or a loop + df[[target_col]] <- sapply(1:nrow(df), function(i) { + if (df[[control_col]][i] == country) { + # Use gregexpr to find all parts of the string that include a numeral + matches <- gregexpr("\\b\\S*\\d\\S*\\b", df[[target_col]][i]) + # Extract the matched parts + result <- regmatches(df[[target_col]][i], matches) + # Combine the matched parts into a single string + result <- unlist(result) + result <- paste(result, collapse = " ") + result <- gsub(" ", "", result) + return(result) + } else { + return(df[[target_col]][i]) + } + }) + + return(df) + } + + + a_df <- keep_numerical_parts(a_df, "country", "scotland", "postal_code") + a_df <- keep_numerical_parts(a_df, "country", "england", "postal_code") + a_df <- keep_numerical_parts(a_df, "country", "northern ireland", "postal_code") + a_df <- keep_numerical_parts(a_df, "country", "wales", "postal_code") + + + + + + + + # INDIA clean-up ---------------------------------------------------------- + + + a_df$postal_code <- ifelse(a_df$country == "india" & grepl("[0-9]{5,10}", a_df$city2), + a_df$city2, a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country == "india" & grepl("[0-9]{5,10}", a_df$city), + a_df$city, a_df$postal_code + ) + + + a_df$city2 <- ifelse(a_df$country == "india" & a_df$state == a_df$city2, + a_df$state, a_df$city2 + ) + a_df$state <- ifelse(a_df$country == "india", NA, a_df$state) + a_df$state <- ifelse(a_df$country == "india" & is.na(a_df$postal_code), + a_df$city, a_df$state + ) + a_df$city <- ifelse(a_df$country == "india" & a_df$state == a_df$city, + NA, a_df$city + ) + a_df$city <- ifelse(a_df$country == "india" & grepl("[0-9]{4,10}", a_df$postal_code), + a_df$postal_code, a_df$city + ) + a_df$city <- ifelse(a_df$country == "india" & is.na(a_df$city), + a_df$city2, a_df$city + ) + + + a_df$postal_code <- ifelse(a_df$country == "india", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code + ) + a_df$city <- ifelse(a_df$country == "india", + gsub("[0-9]", "", a_df$city), + a_df$city + ) + + a_df$city <- ifelse(a_df$country == "india" & (grepl("delhi", a_df$city) | grepl("delhi", a_df$state)), + "new delhi", a_df$city + ) + + + # CHINA clean-up ---------------------------------------------------------- + + + + + + a_df$postal_code <- ifelse(a_df$country == "china" & grepl("[0-9]{5,10}", a_df$city2), + a_df$city2, a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country == "china" & grepl("[0-9]{5,10}", a_df$city), + a_df$city, a_df$postal_code + ) + a_df$postal_code <- ifelse(a_df$country == "china" & grepl("[0-9]{5,10}", a_df$state), + a_df$state, a_df$postal_code + ) + + + a_df$city2 <- ifelse(a_df$country == "china" & a_df$state == a_df$city2, + a_df$state, a_df$city2 + ) + a_df$state <- ifelse(a_df$country == "china", NA, a_df$state) + a_df$state <- ifelse(a_df$country == "china" & is.na(a_df$postal_code), + a_df$city, a_df$state + ) + a_df$city <- ifelse(a_df$country == "china" & a_df$state == a_df$city, + NA, a_df$city + ) + a_df$city <- ifelse(a_df$country == "china" & grepl("[0-9]{4,10}", a_df$postal_code), + a_df$postal_code, a_df$city + ) + a_df$city <- ifelse(a_df$country == "china" & is.na(a_df$city), + a_df$city2, a_df$city + ) + + + a_df$postal_code <- ifelse(a_df$country == "china", + gsub("[A-Za-z]", "", a_df$postal_code), + a_df$postal_code + ) + a_df$city <- ifelse(a_df$country == "china", + gsub("[0-9]", "", a_df$city), + a_df$city + ) + + + + a_df$city <- ifelse(a_df$country == "china" & grepl("beijing", a_df$state), + "beijing", a_df$city + ) + + + # Define words indicating this is actually a dept not state or postal code + # will use this list to delete the ones that don't apply + to_delete <- c( + "&", "inst", "ctr", "med", "chem", "lab", "biol", + "dept", "div", "univ", "hosp", "coll", "sci", "rd", + "program", "minist", "educ", "sch ", "grad ", "fac ", + "assoc", "forest" + ) + + + pattern <- paste(to_delete, collapse = "|") + # Apply the ifelse function to update + a_df$city <- ifelse(a_df$country == "china" & + grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), + NA, a_df$city) + a_df$city<- ifelse(a_df$country=="china" & is.na(a_df$city), + a_df$state, a_df$city) + + + + a_df[] <- lapply(a_df, trimws) + + + + + # This verifies that what is in `city` is actually a city + # (or at least that what is in `city` is NOT a province) + + chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", + "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", + "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", + "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", + "gansu", "inner mongolia", "jilin", "hainan", "ningxia", + "qinghai", "tibet", "macao") + pattern <- paste(to_delete, collapse = "|") + a_df$city<- ifelse(a_df$country=="china" & + grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), + NA, a_df$city) + + + + # pc is letters dash numbers ---------------------------------------------- + + + pattern <- "\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" + + a_df$postal_code <- ifelse(grepl(pattern, a_df$city), + a_df$city, a_df$postal_code + ) + + a_df$postal_code <- ifelse(grepl(pattern, a_df$state), + a_df$state, a_df$postal_code + ) + + a_df$state <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$postal_code), + a_df$city, a_df$state + ) + + + a_df$city <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$postal_code), + a_df$postal_code, a_df$city + ) + + a_df$city2 <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$city), + NA, a_df$city2 + ) + + + + a_df$city <- ifelse(grepl(pattern, a_df$city), + gsub("[0-9]", "", a_df$city), + a_df$city + ) + a_df$city <- gsub("[a-z]{1,2}- ", "", a_df$city) + + + a_df$city <- gsub("[-]", "", a_df$city) + a_df[] <- lapply(a_df, trimws) + + + pattern <- "\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" + + a_df$postal_code <- ifelse(grepl(pattern, a_df$postal_code), + gsub("[a-z]", "", a_df$postal_code), + a_df$postal_code + ) + + a_df$postal_code <- gsub("[-]", "", a_df$postal_code) + a_df[] <- lapply(a_df, trimws) + + + # final check of postal codes (consecutive nos.) -------------------------- + + + # Define the function + extract_consecutive_numbers <- function(df, source, destination) { + df[[destination]] <- sapply(1:nrow(df), function(i) { + # Use gregexpr to find sequences of 4 or more consecutive numbers + if (is.na(df[[destination]][i])) { + matches <- gregexpr("\\d{4,}", df[[source]][i]) + # Extract the matched sequences + result <- regmatches(df[[source]][i], matches) + # Flatten the list of matches into a character vector + result <- unlist(result) + # Combine the matched sequences into a single string + result <- paste(result, collapse = " ") + return(result) + } else { + return(df[[destination]][i]) + } + }) + return(df) + } + + a_df <- extract_consecutive_numbers(a_df, "state", "postal_code") + + + + # clean the city ---------------------------------------------------------- + + # remove any digits + + a_df$city <- gsub("[0-9]", "", a_df$city) + + + + # clean up postal code ---------------------------------------------------- + + + a_df$postal_code <- ifelse(grepl("\\b[a-zA-Z]+\\s+[0-9]+\\b", a_df$postal_code), + gsub("\\b[a-zA-Z]+\\s", "", a_df$postal_code), + a_df$postal_code + ) + + # NETHERLANDS clean-up ---------------------------------------------------- + # cities often have two characters at start (ascii version of ligature/dipthong) + + a_df[] <- lapply(a_df, trimws) + a_df$city <- ifelse(a_df$country == "netherlands" & grepl("^[a-zA-Z]{2} ", a_df$city), + (sub("^[a-zA-Z]{2} ", "", a_df$city)), a_df$city + ) + + a_df[] <- lapply(a_df, trimws) + + + + # Final clean-up of some US cities and states ----------------------------- + + # Remove panama canal zone from usa states (for stri) + a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "cz"), + NA, a_df$state + ) + + # armed forces & diplomatic + a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "aa"), + NA, a_df$state + ) + + a_df$city <- ifelse((a_df$country == "usa" & a_df$state == "apo"), + NA, a_df$city + ) + + a_df$city <- ifelse((a_df$country == "usa" & a_df$state == "dpo"), + NA, a_df$city + ) + + a_df$city <- ifelse(a_df$city == "university pk", + "university park", + a_df$city + ) + + a_df$city <- ifelse(a_df$city == "college stn", + "college station", + a_df$city + ) + + a_df$city <- ifelse(a_df$city == "n chicago", + "north chicago", + a_df$city + ) + + a_df$city <- ifelse(a_df$city == "college pk", + "college park", + a_df$city + ) + + a_df$city <- ifelse(a_df$city == "research triangle pk" | a_df$city == "res triangle pk", + "research triangle park", + a_df$city + ) + + a_df$city <- ifelse(a_df$city == "state coll", + "state college", + a_df$city + ) + + + + a_df$city <- ifelse(grepl("sioux ctr", a_df$city), + (sub("sioux ctr", "sioux city", a_df$city)), a_df$city + ) + + + + a_df$city <- ifelse(grepl("sioux ctr", a_df$city), + (sub("sioux ctr", "sioux city", a_df$city)), a_df$city + ) + + + # Final clean-up of some Brazil cities and states ------------------------- + + + + a_df$city <- ifelse(a_df$city == "gavea rio de janeiro", + "rio de janeiro", + a_df$city + ) + + + a_df$city <- ifelse((a_df$country == "brazil" & a_df$city == "s jose campos"), + "sao jose dos campos", + a_df$city + ) + + a_df$city <- ifelse((a_df$country == "brazil" & (a_df$city == "rio de janerio" | + a_df$city == "rio de janiero" | + a_df$city == "rio der janeiro" | + a_df$city == "rio janeiro" | + a_df$city == "rio janiero")), + "rio de janeiro", + a_df$city + ) + + # Final clean-up of some INDIA cities and states -------------------------- + + + + a_df$city <- ifelse((a_df$city == "dehra dun" & a_df$country == "india"), + "dehradun", + a_df$city + ) + + + # Final clean-up of some CANADA cities and states ------------------------- + + + a_df$city <- ifelse((a_df$city == "st john" & a_df$country == "canada"), + "st. john's", + a_df$city + ) + + + # Final clean-up of some UK cities and states ----------------------------- + + + a_df$state <- ifelse(a_df$state == "london ", + "london", + a_df$state + ) + + a_df$city <- ifelse((a_df$state == "london" & a_df$country == "england"), + "london", + a_df$city + ) + + + # final clean-up of some MEXICO cities and states ------------------------- + + + a_df$city <- ifelse(a_df$country == "mexico" & a_df$city == "df", + "mexico city", + a_df$city + ) + + + # final clean-up of some ARGENTINA cities and states ---------------------- + + + + a_df$city <- ifelse(a_df$country == "argentina" & a_df$city == "df", + "buenos aires", a_df$city + ) + + + # final clean up of some ABBREVIATIONS in city names ---------------------- + + + a_df$city <- ifelse(grepl("^st ", a_df$city), + (sub("^st ", "saint ", a_df$city)), a_df$city + ) + + a_df$city <- ifelse(grepl(" st ", a_df$city), + (sub(" st ", " saint ", a_df$city)), a_df$city + ) + + a_df$city <- ifelse(grepl("^ste ", a_df$city), + (sub("^ste ", "saint ", a_df$city)), a_df$city + ) + + + + # removing departments etc allocated to city or state --------------------- + + # use strings of words typical of institutions or departmewnts to remove + + tech_words <- c( + " lab ", "lab ", " lab", "dept", "hosp", " inst", "inst ", "ctr", + "unit", "ltd", "minist", "educ", "grad ", " sch ", "sch ", " sch", + "coll ", " sci ", "natl", "&", " med", "med ", + "publ", "dept", "biomed", "phys", "technol", + "engn" + ) + pattern <- paste(tech_words, collapse = "|") + + a_df$city <- ifelse((a_df$city != "esch sur alzette" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), + a_df$state, a_df$city + ) + + + a_df$state <- ifelse(a_df$state == a_df$city2, NA, a_df$state) + + a_df$state <- ifelse(grepl("[[:digit:]]", a_df$state), + NA, a_df$state + ) + + a_df$state <- ifelse(a_df$state == "", NA, a_df$state) + + a_df$postal_code <- ifelse(a_df$postal_code == "", NA, a_df$postal_code) + + + + # still some us states not extracting properly but fixed here ------------- + + + + us_state_abbreviations_lower <- c( + "al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", + "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", + "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", + "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", + "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy" + ) + pattern <- paste(us_state_abbreviations_lower, collapse = "|") + a_df$country_list <- country_list + a_df$state <- ifelse((a_df$country == "usa" & is.na(a_df$state) & grepl(pattern, a_df$address, ignore.case = TRUE, perl = TRUE)), + a_df$country_list, a_df$state + ) + + + a_df$state <- ifelse((a_df$country == "usa" & grepl("[[:digit:]]", a_df$state)), + gsub("[[:digit:]]", "", a_df$state), a_df$state + ) + a_df$state <- ifelse((a_df$country == "usa" & grepl("usa", a_df$state)), + gsub("usa", "", a_df$state), a_df$state + ) + a_df$state <- trimws(a_df$state, which = "both") + + + + + # Japanese prefectures & cities sometimes swapped in address -------------- + + + + to_delete <- c( + "&", "inst", "ctr", "med", "chem", "lab", "biol", + "dept", "div", "univ", "hosp", "coll", "sci", "rd", + "program", "minist", "educ", "sch ", "grad ", "fac ", + "assoc", "forest", "corp" + ) + pattern <- paste(to_delete, collapse = "|") + a_df$city2 <- ifelse((a_df$country == "japan" & grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), + NA, a_df$city2 + ) + + # Remove any with numbers + a_df$city2 <- ifelse((a_df$country == "japan" & grepl("[[:digit:]]", a_df$city2)), + NA, a_df$city2 + ) + + japan_prefectures <- c( + "hokkaido", "aomori", "iwate", "miyagi", "akita", + "yamagata", "fukushima", "ibaraki", "tochigi", "gunma", + "saitama", "chiba", "tokyo", "kanagawa", "niigata", + "toyama", "ishikawa", "fukui", "yamanashi", "nagano", "gifu", + "shizuoka", "aichi", "mie", "shiga", "kyoto", "osaka", "gumma", + "hyogo", "nara", "wakayama", "tottori", "shimane", + "okayama", "hiroshima", "yamaguchi", "tokushima", "kagawa", + "ehime", "kochi", "fukuoka", "saga", "nagasaki", "kumamoto", + "oita", "miyazaki", "kagoshima", "okinawa" + ) + pattern <- paste(japan_prefectures, collapse = "|") + + + a_df$state <- ifelse((a_df$country == "japan" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), + a_df$city, a_df$state + ) + + + # This removes all special regions of a city like tokyo from city2 + a_df$city2 <- ifelse((a_df$country == "japan" & grepl(" ku", a_df$city2, ignore.case = TRUE, perl = TRUE)), + NA, a_df$city2 + ) + + # replace from city with city2 EXCEPT in cases where no state (and therefore + # city is correct) and where no city2 (otherwise would bring in NA) + + a_df$city <- ifelse((a_df$country == "japan" & !(is.na(a_df$state)) & !(is.na(a_df$city2))), + a_df$city2, a_df$city + ) + + + + # fine-tuning SCOTLAND ---------------------------------------------------- + + + + a_df$city <- ifelse((a_df$country == "scotland" & grepl("univ ", a_df$city, ignore.case = TRUE, perl = TRUE)), + gsub("univ ", "", a_df$city), a_df$city + ) + + to_delete <- c( + " ave", " grp", "hlth", " rd", "mrc", " oba", "plz", + " dr", "oqb", " quad", "fisheries" + ) + + pattern <- paste(to_delete, collapse = "|") + a_df$city <- ifelse((a_df$country == "scotland" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), + NA, a_df$city + ) + + + + # fine-tuning ENGLAND ----------------------------------------------------- + + + to_delete <- c( + "&", "inst", "ctr", "med", "chem", "lab", "biol", + "dept", "div", "univ", "hosp", "coll", "sci", "rd", + "program", "minist", "educ", "sch ", "grad ", "fac ", + " sq", "quarter", " way", " dr", "diagnost", "consultant", + "microsoft", "diagnost", "[[:digit:]]", "project", "facil", "grp", + "campus", "expt", " pk", "canc", "assoc", "forest", "corp", + "consortium", "partners", "lane", "ucl", "street", "trust", + "business", "inform", "royal", "survey", "drosophila", " st", + "ndorms", "nat hist", "hlth", " ave", "council", "unit", "nerc", "nat res" + ) + pattern <- paste(to_delete, collapse = "|") + a_df$city2 <- ifelse((a_df$country == "england" & grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), + NA, a_df$city2 + ) + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)), + a_df$city2, a_df$city + ) + + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & + grepl("london", a_df$address, ignore.case = TRUE, perl = TRUE), + "london", a_df$city + ) + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & + grepl("cambridge", a_df$address, ignore.case = TRUE, perl = TRUE), + "cambridge", a_df$city + ) + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & + grepl("oxford", a_df$address, ignore.case = TRUE, perl = TRUE), + "oxford", a_df$city + ) + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & + grepl("durham", a_df$address, ignore.case = TRUE, perl = TRUE), + "durham", a_df$city + ) + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & + grepl("bristol", a_df$address, ignore.case = TRUE, perl = TRUE), + "bristol", a_df$city + ) + + a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & + grepl("lancaster", a_df$address, ignore.case = TRUE, perl = TRUE), + "lancaster", a_df$city + ) + + + + # delete columns used to 2x check ---------------------------------------- + + a_df$city2 <- NULL + a_df$country_list <- NULL + + + # return output of function ----------------------------------------------- + + return(a_df) } diff --git a/R/authors_address_update.R b/R/authors_address_update.R deleted file mode 100644 index bbc68e0..0000000 --- a/R/authors_address_update.R +++ /dev/null @@ -1,1133 +0,0 @@ -#' Parses out address information and splits it into its respective parts. -#' This is an internal function used by \code{authors_clean} -#' -#' \code{authors_address} This function takes the output from -#' \code{references_read} and pulls out address information. Splitting it into -#' university, department, city, state, etc. -#' @param addresses the addresses -#' @param ID the authorID -#' @noRd -authors_address <- function(addresses, ID) { - - addresses <- tolower(addresses) - - message("\nSplitting addresses\n") - list_address <- strsplit(addresses, ",") - - - - # remove punctuation ---------------------------------------- - - ## First remove periods and trim white space from countries. - ## helps avoids mistakes later on - - remove_period_from_last <- function(list_address) { - lapply(list_address, function(x) { - if (length(x) > 0) { - x[length(x)] <- gsub("\\.$", "", x[length(x)]) - x[length(x)] <- trimws(x[length(x)], which = "both") - } - return(x) - }) - } - - list_address <- remove_period_from_last(list_address) - - # trim ws ----------------------------------------------------------------- - - list_address <- lapply(list_address, trimws) - - # correct countries ------------------------------------------------------- - - - # correct or update names of some countries to make it possible to georef - - # Define the function - correct_countries <- function(my_list, replacements) { - # Loop through each element of the list - for (i in 1:length(my_list)) { - # Get the length of the current element - len <- length(my_list[[i]]) - - # Check if the last item matches any of the target words - if (len > 0 && my_list[[i]][len] %in% names(replacements)) { - # Replace the last item with the corresponding replacement word - my_list[[i]][len] <- replacements[[my_list[[i]][len]]] - } - } - return(my_list) - } - # NB: also updated country names. - # czechia = new name for czech republic - # united arab rep = current country name depends on city - replacements <- c( - "austl" = "australia", - "c z" = "czechia", - "cz" = "czechia", - "czech republic" = "czechia", - "fed rep ger" = "germany", - "columbia" = "colombia", - "peoples r china" = "china", - "u arab emirates" = "united arab emirates", - "mongol peo rep" = "mongolia", - "dominican rep" = "dominican republic", - "fr polynesia" = "french polynesia", - "neth antilles" = "netherland antilles", - "trinid & tobago" = "trinidad & tobago", - "rep congo" = "congo", - "north ireland" = "northern ireland", - "syrian arab rep" = "syria" - ) - - list_address <- correct_countries(list_address, replacements) - - # extract university ------------------------------------------------------ - - university_list <- vapply(list_address, function(x) x[1], character(1)) - - # extract department ------------------------------------------------------ - - # If department is listed it is typically second - # (EB note: only if 4+ slots) - # this will be 2x checked later - - dept_extract <- function(x) { - if (length(x) < 4) { - return(NA) - } else { - return(trimws(x[[2]])) - } - } - - dept_list <- unlist(lapply(list_address, dept_extract)) - - dept_list <- trimws(dept_list, which = "both") - - - # Extract City ------------------------------------------------------------ - - # If there is only one element, then it can't have both city and country' - city_list <- vapply(list_address, function(x) { - n <- length(x) - if (n == 1) { - return("no city") # placeholder to replace with NA after function - } - - # In some cases city is next-to-last element, in others next-to-next-to-last - last_element <- x[[n]] - second_last <- if (n > 1) x[[n - 1]] else NA - third_last <- if (n > 2) x[[n - 2]] else NA - - # Default case - return(second_last) - }, character(1)) - - # Cleanup - city_list <- trimws(city_list, which = "both") - city_list[city_list == "no city"] <- NA - - - # extract state ----------------------------------------------------------- - - # If there is only one element, then it can't have both city and country' - state_list <- vapply(list_address, function(x) { - n <- length(x) - if (n == 1) { - return("no state") # placeholder to replace with NA after function - } - - # In some cases city is next-to-last element, in others next-to-next-to-last - last_element <- x[[n]] - second_last <- if (n > 1) x[[n - 1]] else NA - third_last <- if (n > 2) x[[n - 2]] else NA - - # Default case - return(third_last) - }, character(1)) - - # Cleanup - state_list <- trimws(state_list, which = "both") - state_list[state_list == "no state"] <- NA - - # this is used to double check later - sometimes city is extracted as state - city_list2 <- trimws(state_list, which = "both") - - # Extract Country --------------------------------------------------------- - - country_list <- vapply( - list_address, function(x) { - gsub("\\_", "", x[length(x)]) - }, - character(1) - ) - - - # postal code (pc) list --------------------------------------------------- - - # pc often with city - - pc_list <- city_list - - # bind all into df -------------------------------------------------------- - - a_df <- data.frame( - adID = ID, - university = university_list, - country = country_list, - state = state_list, - postal_code = pc_list, - city = city_list, - city2 = city_list2, - department = dept_list, - address = addresses, - stringsAsFactors = FALSE - ) - - - - # any PC without numbers gets NA'd - a_df$postal_code[!grepl("\\d", a_df$postal_code)] <- NA - - # copy over PC and state - a_df$state <- ifelse(grepl("usa", a_df$country) & nchar(a_df$state) > 2, - NA, - a_df$state - ) - - - a_df$postal_code <- ifelse(grepl("[a-z]{2} [0-9]{5} usa", a_df$country), - a_df$country, a_df$postal_code - ) - - a_df$state <- ifelse(grepl("[a-z]{2} [0-9]{5} usa", a_df$country), - a_df$country, a_df$state - ) - - a_df$state <- ifelse(grepl("[a-z]{2} [0-9]{5}", a_df$city), - a_df$city, a_df$state - ) - - - a_df$state <- ifelse(grepl("[a-z]{2} usa", a_df$country), - a_df$country, a_df$state - ) - - # remove the numbers and letters as appropriate - - - a_df$country <- ifelse(grepl(" usa", a_df$country), - "usa", a_df$country - ) - - a_df$state <- ifelse(a_df$country == "usa" & grepl("[a-z]{2} [0-9]{5}", a_df$state), - gsub("[[:digit:]]{5}", "", a_df$state), a_df$state - ) - - a_df$state <- ifelse(a_df$country == "usa" & grepl(" usa", a_df$state), - gsub(" usa", "", a_df$state), a_df$state - ) - - - a_df$postal_code <- ifelse(a_df$country == "usa", - gsub( - "[[:alpha:]]{2} ", "", - a_df$postal_code - ), a_df$postal_code - ) - - a_df$postal_code <- ifelse(a_df$country == "usa", - gsub( - " usa", "", - a_df$postal_code - ), a_df$postal_code - ) - - - - - a_df$city <- ifelse(a_df$country == "usa" & grepl("[a-z]{2} [0-9]{5}", a_df$city), - a_df$city2, a_df$city - ) - - - pattern <- "[a-z]{2} [0-9]{5}" - - a_df$postal_code <- ifelse(grepl(pattern, a_df$country), - a_df$country, a_df$postal_code - ) - a_df$state <- ifelse(grepl(pattern, a_df$country), - a_df$country, a_df$state - ) - a_df$country <- ifelse(grepl(pattern, a_df$country), - "usa", a_df$country - ) - a_df$postal_code <- ifelse(a_df$country == "usa" & grepl(pattern, a_df$postal_code), - gsub("[a-z]", "", a_df$postal_code), a_df$postal_code - ) - a_df$state <- ifelse(a_df$country == "usa" & grepl(pattern, a_df$state), - gsub("[0-9]", "", a_df$postal_code), a_df$state - ) - - - - # BRAZIL clean-up --------------------------------------------------------- - - a_df$state <- ifelse(a_df$country == "brazil" & nchar(a_df$city) == 2, - a_df$city, a_df$state - ) - a_df$city <- ifelse(a_df$country == "brazil" & nchar(a_df$city) == 2, - a_df$city2, a_df$city - ) - a_df$city2 <- ifelse(a_df$country == "brazil" & a_df$city == a_df$city2, - NA, a_df$city2 - ) - a_df$postal_code <- ifelse(a_df$country == "brazil" & is.na(a_df$postal_code), - a_df$city, a_df$postal_code - ) - a_df$state <- ifelse(a_df$country == "brazil" & nchar(a_df$state) > 2, - NA, a_df$state - ) - - a_df$postal_code <- ifelse(a_df$country == "brazil", - gsub("[A-Za-z]", "", a_df$postal_code), - a_df$postal_code - ) - a_df$postal_code <- ifelse(a_df$country == "brazil", - gsub("[-]", "", a_df$postal_code), - a_df$postal_code - ) - - a_df$city <- ifelse(a_df$country == "brazil", - gsub("br-", "", a_df$city), - a_df$city - ) - a_df$city <- ifelse(a_df$country == "brazil", - gsub("[0-9]", "", a_df$city), - a_df$city - ) - - - a_df$state <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2), - a_df$city, a_df$state - ) - a_df$postal_code <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2), - a_df$city2, a_df$postal_code - ) - a_df$city <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2), - a_df$city2, a_df$city - ) - - - # repeat the clean of city - a_df$city <- ifelse(a_df$country == "brazil", - gsub("br-", "", a_df$city), - a_df$city - ) - a_df$city <- ifelse(a_df$country == "brazil", - gsub("[0-9]", "", a_df$city), - a_df$city - ) - - - a_df$postal_code <- ifelse(a_df$country == "brazil", - gsub("[A-Za-z]", "", a_df$postal_code), - a_df$postal_code - ) - a_df$postal_code <- ifelse(a_df$country == "brazil", - gsub("[-]", "", a_df$postal_code), - a_df$postal_code - ) - - a_df[] <- lapply(a_df, trimws) - - - # Define city-to-state mapping - city_state_mapping <- data.frame( - city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), - state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), - stringsAsFactors = FALSE - ) - - # Match cities and states - for (i in 1:nrow(city_state_mapping)) { - a_df$city <- ifelse(a_df$country == "brazil" & grepl(city_state_mapping$city[i], a_df$state, ignore.case = TRUE), - city_state_mapping$city[i], a_df$city - ) - a_df$state <- ifelse(a_df$country == "brazil" & grepl(city_state_mapping$city[i], a_df$state, ignore.case = TRUE), - city_state_mapping$state[i], a_df$state - ) - } - - # Match cities and states - for (i in 1:nrow(city_state_mapping)) { - a_df$state <- ifelse(a_df$country == "brazil" & grepl(city_state_mapping$city[i], a_df$city, ignore.case = TRUE), - city_state_mapping$state[i], a_df$state - ) - } - - # AUSTRALIA clean-up--------------------------------------------------------------- - - a_df$state <- ifelse(a_df$country == "australia", - a_df$city, a_df$state - ) - a_df$postal_code <- ifelse(a_df$country == "australia", - a_df$city, a_df$postal_code - ) - a_df$city <- ifelse(a_df$country == "australia", - a_df$city2, a_df$city - ) - a_df$city2 <- ifelse(a_df$country == "australia", - NA, a_df$city2 - ) - - - a_df$postal_code <- ifelse(a_df$country == "australia", - gsub("[A-Za-z]", "", a_df$postal_code), - a_df$postal_code - ) - a_df$state <- ifelse(a_df$country == "australia", - gsub("[0-9]", "", a_df$state), - a_df$state - ) - - a_df[] <- lapply(a_df, trimws) - - - - # CANADA clean-up --------------------------------------------------------- - - a_df$state <- ifelse(a_df$country == "canada" & nchar(a_df$city) == 2, - a_df$city, a_df$state - ) - - a_df$city <- ifelse(a_df$country == "canada" & nchar(a_df$city) == 2, - NA, a_df$city - ) - - a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2), - a_df$city2, a_df$postal_code - ) - a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2), - a_df$city2, a_df$postal_code - ) - - a_df$state <- ifelse(a_df$country == "canada" & a_df$city2 == a_df$state, - NA, a_df$state - ) - - a_df$city <- ifelse(a_df$country == "canada", a_df$city2, a_df$city) - a_df$city2 <- ifelse(a_df$country == "canada", NA, a_df$city2) - - - a_df$city <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city), - gsub("\\b(\\w{3})\\b \\b(\\w{3})\\b", "", a_df$city), - a_df$city - ) - - a_df$state <- ifelse(a_df$country == "canada" & is.na(a_df$state), - a_df$postal_code, - a_df$state - ) - - a_df$state <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$state), - gsub("\\b(\\w{3})\\b \\b(\\w{3})\\b", "", a_df$state), - a_df$state - ) - - a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{2,20})\\b \\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$postal_code), - gsub("\\b(\\w{1,2}|\\w{4,})\\b", "", a_df$postal_code), - a_df$postal_code - ) - - a_df[] <- lapply(a_df, trimws) - - # TODO: a few postal codes still have letters from city - - - a_df$postal_code <- ifelse(a_df$country == "canada", gsub(" ", "", a_df$postal_code), a_df$postal_code) - - - # UK clean-up ------------------------------------------------------------- - - uk <- c("scotland", "england", "wales", "northern ireland") - pattern <- "[a-z0-9]{2,4} [a-z0-9]{3,4}" - # - # a_df$postal_code <- ifelse(a_df$country %in% uk & - # grepl(pattern, a_df$city2),a_df$city2, - # a_df$postal_code) - - a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city2), - a_df$city2, a_df$postal_code - ) - a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$state), - a_df$state, a_df$postal_code - ) - a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city), - a_df$city, a_df$postal_code - ) - - a_df$postal_code <- ifelse(a_df$country %in% uk, - ifelse(!grepl("\\d", a_df$postal_code), NA, a_df$postal_code), - a_df$postal_code - ) - - a_df$city <- ifelse(a_df$country %in% uk & a_df$city == a_df$postal_code, - NA, a_df$city - ) - - a_df$state <- ifelse(a_df$country %in% uk & a_df$state == a_df$postal_code, - NA, a_df$state - ) - - - a_df$state <- ifelse(a_df$country == "england", a_df$city, a_df$state) - a_df$city <- ifelse(a_df$country == "england", NA, a_df$city) - a_df$city <- ifelse(a_df$country == "england", a_df$postal_code, a_df$city) - a_df$city <- ifelse(a_df$country == "england", - gsub("\\b\\w*\\d\\w*\\b", "", a_df$city), - a_df$city - ) - - # TODO: england still needs work - - a_df$state <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales", NA, a_df$state) - a_df$state <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales" & is.na(a_df$state), a_df$city, a_df$state) - a_df$city <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales", a_df$postal_code, a_df$city) - a_df$city <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales" & is.na(a_df$city), a_df$city2, a_df$city) - a_df$city <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales", - gsub("\\b\\w*\\d\\w*\\b", "", a_df$city), - a_df$city - ) - - - # postal codes clean uk --------------------------------------------------- - - - # Define the function - keep_numerical_parts <- function(df, control_col, country, target_col) { - # Apply the function to each row using sapply or a loop - df[[target_col]] <- sapply(1:nrow(df), function(i) { - if (df[[control_col]][i] == country) { - # Use gregexpr to find all parts of the string that include a numeral - matches <- gregexpr("\\b\\S*\\d\\S*\\b", df[[target_col]][i]) - # Extract the matched parts - result <- regmatches(df[[target_col]][i], matches) - # Combine the matched parts into a single string - result <- unlist(result) - result <- paste(result, collapse = " ") - result <- gsub(" ", "", result) - return(result) - } else { - return(df[[target_col]][i]) - } - }) - - return(df) - } - - - a_df <- keep_numerical_parts(a_df, "country", "scotland", "postal_code") - a_df <- keep_numerical_parts(a_df, "country", "england", "postal_code") - a_df <- keep_numerical_parts(a_df, "country", "northern ireland", "postal_code") - a_df <- keep_numerical_parts(a_df, "country", "wales", "postal_code") - - - - - - - - # INDIA clean-up ---------------------------------------------------------- - - - a_df$postal_code <- ifelse(a_df$country == "india" & grepl("[0-9]{5,10}", a_df$city2), - a_df$city2, a_df$postal_code - ) - a_df$postal_code <- ifelse(a_df$country == "india" & grepl("[0-9]{5,10}", a_df$city), - a_df$city, a_df$postal_code - ) - - - a_df$city2 <- ifelse(a_df$country == "india" & a_df$state == a_df$city2, - a_df$state, a_df$city2 - ) - a_df$state <- ifelse(a_df$country == "india", NA, a_df$state) - a_df$state <- ifelse(a_df$country == "india" & is.na(a_df$postal_code), - a_df$city, a_df$state - ) - a_df$city <- ifelse(a_df$country == "india" & a_df$state == a_df$city, - NA, a_df$city - ) - a_df$city <- ifelse(a_df$country == "india" & grepl("[0-9]{4,10}", a_df$postal_code), - a_df$postal_code, a_df$city - ) - a_df$city <- ifelse(a_df$country == "india" & is.na(a_df$city), - a_df$city2, a_df$city - ) - - - a_df$postal_code <- ifelse(a_df$country == "india", - gsub("[A-Za-z]", "", a_df$postal_code), - a_df$postal_code - ) - a_df$city <- ifelse(a_df$country == "india", - gsub("[0-9]", "", a_df$city), - a_df$city - ) - - a_df$city <- ifelse(a_df$country == "india" & (grepl("delhi", a_df$city) | grepl("delhi", a_df$state)), - "new delhi", a_df$city - ) - - - # CHINA clean-up ---------------------------------------------------------- - - - - - - a_df$postal_code <- ifelse(a_df$country == "china" & grepl("[0-9]{5,10}", a_df$city2), - a_df$city2, a_df$postal_code - ) - a_df$postal_code <- ifelse(a_df$country == "china" & grepl("[0-9]{5,10}", a_df$city), - a_df$city, a_df$postal_code - ) - a_df$postal_code <- ifelse(a_df$country == "china" & grepl("[0-9]{5,10}", a_df$state), - a_df$state, a_df$postal_code - ) - - - a_df$city2 <- ifelse(a_df$country == "china" & a_df$state == a_df$city2, - a_df$state, a_df$city2 - ) - a_df$state <- ifelse(a_df$country == "china", NA, a_df$state) - a_df$state <- ifelse(a_df$country == "china" & is.na(a_df$postal_code), - a_df$city, a_df$state - ) - a_df$city <- ifelse(a_df$country == "china" & a_df$state == a_df$city, - NA, a_df$city - ) - a_df$city <- ifelse(a_df$country == "china" & grepl("[0-9]{4,10}", a_df$postal_code), - a_df$postal_code, a_df$city - ) - a_df$city <- ifelse(a_df$country == "china" & is.na(a_df$city), - a_df$city2, a_df$city - ) - - - a_df$postal_code <- ifelse(a_df$country == "china", - gsub("[A-Za-z]", "", a_df$postal_code), - a_df$postal_code - ) - a_df$city <- ifelse(a_df$country == "china", - gsub("[0-9]", "", a_df$city), - a_df$city - ) - - - - a_df$city <- ifelse(a_df$country == "china" & grepl("beijing", a_df$state), - "beijing", a_df$city - ) - - - # Define words indicating this is actually a dept not state or postal code - # will use this list to delete the ones that don't apply - to_delete <- c( - "&", "inst", "ctr", "med", "chem", "lab", "biol", - "dept", "div", "univ", "hosp", "coll", "sci", "rd", - "program", "minist", "educ", "sch ", "grad ", "fac ", - "assoc", "forest" - ) - - - pattern <- paste(to_delete, collapse = "|") - # Apply the ifelse function to update - a_df$city <- ifelse(a_df$country == "china" & - grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), - NA, a_df$city) - a_df$city<- ifelse(a_df$country=="china" & is.na(a_df$city), - a_df$state, a_df$city) - - - - a_df[] <- lapply(a_df, trimws) - - - - - # This verifies that what is in `city` is actually a city - # (or at least that what is in `city` is NOT a province) - - chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", - "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", - "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", - "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", - "gansu", "inner mongolia", "jilin", "hainan", "ningxia", - "qinghai", "tibet", "macao") - pattern <- paste(to_delete, collapse = "|") - a_df$city<- ifelse(a_df$country=="china" & - grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), - NA, a_df$city) - - - - # pc is letters dash numbers ---------------------------------------------- - - - pattern <- "\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" - - a_df$postal_code <- ifelse(grepl(pattern, a_df$city), - a_df$city, a_df$postal_code - ) - - a_df$postal_code <- ifelse(grepl(pattern, a_df$state), - a_df$state, a_df$postal_code - ) - - a_df$state <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$postal_code), - a_df$city, a_df$state - ) - - - a_df$city <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$postal_code), - a_df$postal_code, a_df$city - ) - - a_df$city2 <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$city), - NA, a_df$city2 - ) - - - - a_df$city <- ifelse(grepl(pattern, a_df$city), - gsub("[0-9]", "", a_df$city), - a_df$city - ) - a_df$city <- gsub("[a-z]{1,2}- ", "", a_df$city) - - - a_df$city <- gsub("[-]", "", a_df$city) - a_df[] <- lapply(a_df, trimws) - - - pattern <- "\\b[A-Za-z]{1,3}[-][0-9]{3,8}\\b" - - a_df$postal_code <- ifelse(grepl(pattern, a_df$postal_code), - gsub("[a-z]", "", a_df$postal_code), - a_df$postal_code - ) - - a_df$postal_code <- gsub("[-]", "", a_df$postal_code) - a_df[] <- lapply(a_df, trimws) - - - # final check of postal codes (consecutive nos.) -------------------------- - - - # Define the function - extract_consecutive_numbers <- function(df, source, destination) { - df[[destination]] <- sapply(1:nrow(df), function(i) { - # Use gregexpr to find sequences of 4 or more consecutive numbers - if (is.na(df[[destination]][i])) { - matches <- gregexpr("\\d{4,}", df[[source]][i]) - # Extract the matched sequences - result <- regmatches(df[[source]][i], matches) - # Flatten the list of matches into a character vector - result <- unlist(result) - # Combine the matched sequences into a single string - result <- paste(result, collapse = " ") - return(result) - } else { - return(df[[destination]][i]) - } - }) - return(df) - } - - a_df <- extract_consecutive_numbers(a_df, "state", "postal_code") - - - - # clean the city ---------------------------------------------------------- - - # remove any digits - - a_df$city <- gsub("[0-9]", "", a_df$city) - - - - # clean up postal code ---------------------------------------------------- - - - a_df$postal_code <- ifelse(grepl("\\b[a-zA-Z]+\\s+[0-9]+\\b", a_df$postal_code), - gsub("\\b[a-zA-Z]+\\s", "", a_df$postal_code), - a_df$postal_code - ) - - # NETHERLANDS clean-up ---------------------------------------------------- - # cities often have two characters at start (ascii version of ligature/dipthong) - - a_df[] <- lapply(a_df, trimws) - a_df$city <- ifelse(a_df$country == "netherlands" & grepl("^[a-zA-Z]{2} ", a_df$city), - (sub("^[a-zA-Z]{2} ", "", a_df$city)), a_df$city - ) - - a_df[] <- lapply(a_df, trimws) - - - - # Final clean-up of some US cities and states ----------------------------- - - # Remove panama canal zone from usa states (for stri) - a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "cz"), - NA, a_df$state - ) - - # armed forces & diplomatic - a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "aa"), - NA, a_df$state - ) - - a_df$city <- ifelse((a_df$country == "usa" & a_df$state == "apo"), - NA, a_df$city - ) - - a_df$city <- ifelse((a_df$country == "usa" & a_df$state == "dpo"), - NA, a_df$city - ) - - a_df$city <- ifelse(a_df$city == "university pk", - "university park", - a_df$city - ) - - a_df$city <- ifelse(a_df$city == "college stn", - "college station", - a_df$city - ) - - a_df$city <- ifelse(a_df$city == "n chicago", - "north chicago", - a_df$city - ) - - a_df$city <- ifelse(a_df$city == "college pk", - "college park", - a_df$city - ) - - a_df$city <- ifelse(a_df$city == "research triangle pk" | a_df$city == "res triangle pk", - "research triangle park", - a_df$city - ) - - a_df$city <- ifelse(a_df$city == "state coll", - "state college", - a_df$city - ) - - - - a_df$city <- ifelse(grepl("sioux ctr", a_df$city), - (sub("sioux ctr", "sioux city", a_df$city)), a_df$city - ) - - - - a_df$city <- ifelse(grepl("sioux ctr", a_df$city), - (sub("sioux ctr", "sioux city", a_df$city)), a_df$city - ) - - - # Final clean-up of some Brazil cities and states ------------------------- - - - - a_df$city <- ifelse(a_df$city == "gavea rio de janeiro", - "rio de janeiro", - a_df$city - ) - - - a_df$city <- ifelse((a_df$country == "brazil" & a_df$city == "s jose campos"), - "sao jose dos campos", - a_df$city - ) - - a_df$city <- ifelse((a_df$country == "brazil" & (a_df$city == "rio de janerio" | - a_df$city == "rio de janiero" | - a_df$city == "rio der janeiro" | - a_df$city == "rio janeiro" | - a_df$city == "rio janiero")), - "rio de janeiro", - a_df$city - ) - - # Final clean-up of some INDIA cities and states -------------------------- - - - - a_df$city <- ifelse((a_df$city == "dehra dun" & a_df$country == "india"), - "dehradun", - a_df$city - ) - - - # Final clean-up of some CANADA cities and states ------------------------- - - - a_df$city <- ifelse((a_df$city == "st john" & a_df$country == "canada"), - "st. john's", - a_df$city - ) - - - # Final clean-up of some UK cities and states ----------------------------- - - - a_df$state <- ifelse(a_df$state == "london ", - "london", - a_df$state - ) - - a_df$city <- ifelse((a_df$state == "london" & a_df$country == "england"), - "london", - a_df$city - ) - - - # final clean-up of some MEXICO cities and states ------------------------- - - - a_df$city <- ifelse(a_df$country == "mexico" & a_df$city == "df", - "mexico city", - a_df$city - ) - - - # final clean-up of some ARGENTINA cities and states ---------------------- - - - - a_df$city <- ifelse(a_df$country == "argentina" & a_df$city == "df", - "buenos aires", a_df$city - ) - - - # final clean up of some ABBREVIATIONS in city names ---------------------- - - - a_df$city <- ifelse(grepl("^st ", a_df$city), - (sub("^st ", "saint ", a_df$city)), a_df$city - ) - - a_df$city <- ifelse(grepl(" st ", a_df$city), - (sub(" st ", " saint ", a_df$city)), a_df$city - ) - - a_df$city <- ifelse(grepl("^ste ", a_df$city), - (sub("^ste ", "saint ", a_df$city)), a_df$city - ) - - - - # removing departments etc allocated to city or state --------------------- - - # use strings of words typical of institutions or departmewnts to remove - - tech_words <- c( - " lab ", "lab ", " lab", "dept", "hosp", " inst", "inst ", "ctr", - "unit", "ltd", "minist", "educ", "grad ", " sch ", "sch ", " sch", - "coll ", " sci ", "natl", "&", " med", "med ", - "publ", "dept", "biomed", "phys", "technol", - "engn" - ) - pattern <- paste(tech_words, collapse = "|") - - a_df$city <- ifelse((a_df$city != "esch sur alzette" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), - a_df$state, a_df$city - ) - - - a_df$state <- ifelse(a_df$state == a_df$city2, NA, a_df$state) - - a_df$state <- ifelse(grepl("[[:digit:]]", a_df$state), - NA, a_df$state - ) - - a_df$state <- ifelse(a_df$state == "", NA, a_df$state) - - a_df$postal_code <- ifelse(a_df$postal_code == "", NA, a_df$postal_code) - - - - # still some us states not extracting properly but fixed here ------------- - - - - us_state_abbreviations_lower <- c( - "al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", - "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md", - "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj", - "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc", - "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy" - ) - pattern <- paste(us_state_abbreviations_lower, collapse = "|") - a_df$country_list <- country_list - a_df$state <- ifelse((a_df$country == "usa" & is.na(a_df$state) & grepl(pattern, a_df$address, ignore.case = TRUE, perl = TRUE)), - a_df$country_list, a_df$state - ) - - - a_df$state <- ifelse((a_df$country == "usa" & grepl("[[:digit:]]", a_df$state)), - gsub("[[:digit:]]", "", a_df$state), a_df$state - ) - a_df$state <- ifelse((a_df$country == "usa" & grepl("usa", a_df$state)), - gsub("usa", "", a_df$state), a_df$state - ) - a_df$state <- trimws(a_df$state, which = "both") - - - - - # Japanese prefectures & cities sometimes swapped in address -------------- - - - - to_delete <- c( - "&", "inst", "ctr", "med", "chem", "lab", "biol", - "dept", "div", "univ", "hosp", "coll", "sci", "rd", - "program", "minist", "educ", "sch ", "grad ", "fac ", - "assoc", "forest", "corp" - ) - pattern <- paste(to_delete, collapse = "|") - a_df$city2 <- ifelse((a_df$country == "japan" & grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), - NA, a_df$city2 - ) - - # Remove any with numbers - a_df$city2 <- ifelse((a_df$country == "japan" & grepl("[[:digit:]]", a_df$city2)), - NA, a_df$city2 - ) - - japan_prefectures <- c( - "hokkaido", "aomori", "iwate", "miyagi", "akita", - "yamagata", "fukushima", "ibaraki", "tochigi", "gunma", - "saitama", "chiba", "tokyo", "kanagawa", "niigata", - "toyama", "ishikawa", "fukui", "yamanashi", "nagano", "gifu", - "shizuoka", "aichi", "mie", "shiga", "kyoto", "osaka", "gumma", - "hyogo", "nara", "wakayama", "tottori", "shimane", - "okayama", "hiroshima", "yamaguchi", "tokushima", "kagawa", - "ehime", "kochi", "fukuoka", "saga", "nagasaki", "kumamoto", - "oita", "miyazaki", "kagoshima", "okinawa" - ) - pattern <- paste(japan_prefectures, collapse = "|") - - - a_df$state <- ifelse((a_df$country == "japan" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), - a_df$city, a_df$state - ) - - - # This removes all special regions of a city like tokyo from city2 - a_df$city2 <- ifelse((a_df$country == "japan" & grepl(" ku", a_df$city2, ignore.case = TRUE, perl = TRUE)), - NA, a_df$city2 - ) - - # replace from city with city2 EXCEPT in cases where no state (and therefore - # city is correct) and where no city2 (otherwise would bring in NA) - - a_df$city <- ifelse((a_df$country == "japan" & !(is.na(a_df$state)) & !(is.na(a_df$city2))), - a_df$city2, a_df$city - ) - - - - # fine-tuning SCOTLAND ---------------------------------------------------- - - - - a_df$city <- ifelse((a_df$country == "scotland" & grepl("univ ", a_df$city, ignore.case = TRUE, perl = TRUE)), - gsub("univ ", "", a_df$city), a_df$city - ) - - to_delete <- c( - " ave", " grp", "hlth", " rd", "mrc", " oba", "plz", - " dr", "oqb", " quad", "fisheries" - ) - - pattern <- paste(to_delete, collapse = "|") - a_df$city <- ifelse((a_df$country == "scotland" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), - NA, a_df$city - ) - - - - # fine-tuning ENGLAND ----------------------------------------------------- - - - to_delete <- c( - "&", "inst", "ctr", "med", "chem", "lab", "biol", - "dept", "div", "univ", "hosp", "coll", "sci", "rd", - "program", "minist", "educ", "sch ", "grad ", "fac ", - " sq", "quarter", " way", " dr", "diagnost", "consultant", - "microsoft", "diagnost", "[[:digit:]]", "project", "facil", "grp", - "campus", "expt", " pk", "canc", "assoc", "forest", "corp", - "consortium", "partners", "lane", "ucl", "street", "trust", - "business", "inform", "royal", "survey", "drosophila", " st", - "ndorms", "nat hist", "hlth", " ave", "council", "unit", "nerc", "nat res" - ) - pattern <- paste(to_delete, collapse = "|") - a_df$city2 <- ifelse((a_df$country == "england" & grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), - NA, a_df$city2 - ) - - a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)), - a_df$city2, a_df$city - ) - - - a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & - grepl("london", a_df$address, ignore.case = TRUE, perl = TRUE), - "london", a_df$city - ) - - a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & - grepl("cambridge", a_df$address, ignore.case = TRUE, perl = TRUE), - "cambridge", a_df$city - ) - - a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & - grepl("oxford", a_df$address, ignore.case = TRUE, perl = TRUE), - "oxford", a_df$city - ) - - a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & - grepl("durham", a_df$address, ignore.case = TRUE, perl = TRUE), - "durham", a_df$city - ) - - a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & - grepl("bristol", a_df$address, ignore.case = TRUE, perl = TRUE), - "bristol", a_df$city - ) - - a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & - grepl("lancaster", a_df$address, ignore.case = TRUE, perl = TRUE), - "lancaster", a_df$city - ) - - - - # delete columns used to 2x check ---------------------------------------- - - a_df$city2 <- NULL - a_df$country_list <- NULL - - - # return output of function ----------------------------------------------- - - - return(a_df) -} From b7ad8b1ca5e49d273d33ea5ac439e66353108c46 Mon Sep 17 00:00:00 2001 From: embruna Date: Fri, 21 Mar 2025 11:57:13 -0400 Subject: [PATCH 17/34] formatting, added default to function call --- R/authors_georef.R | 318 +++++++++++++++++++++++---------------------- 1 file changed, 165 insertions(+), 153 deletions(-) diff --git a/R/authors_georef.R b/R/authors_georef.R index 666f4bf..a4264f8 100644 --- a/R/authors_georef.R +++ b/R/authors_georef.R @@ -1,30 +1,24 @@ #' Extracts the lat and long for each address from authors_clean #' #' \code{authors_georef} This function takes the final author list from -#' refine_authors, and calculates the lat long of the addresses. -#' It does this by feeding the addresses into data science toolkit. -#' In order to maximize effectiveness and mitigate errors in parsing addresses -#' We run this multiple times creating addresses in different ways -#' in hopes that the google georeferencing API can recognize an address -#' 1st. University, city, zipcode, country -#' 2nd. City, zipcode, country -#' 3rd. city, country -#' 4th. University, country +#' refine_authors, and calculates the lat long of the city, country, and postal +#' code (for USA addresses) or city and country (for addresses outside the USA). #' -#' The output is a list with three data.frames -#' \code{addresses} All info from 'refine_authors' plus new columns with -#' lat & long. It includes ALL addresses, including those -#' that could not be geocoded. -#' \code{missing_addresses} A data frame of the addresses that could +#' The output is a list of three data.frames +#' \code{addresses} All info from 'refine_authors' plus new columns with +#' lat & long. It includes ALL addresses, including those that could not +#' be geocoded. +#' \code{missing_addresses} A data frame of the addresses that could #' NOT be geocoded. -#' \code{no_missing_addresses} the \code{addresses} data frame with ONLY the +#' \code{no_missing_addresses} the \code{addresses} data frame with ONLY the #' addresses that were geocoded. #' #' @param data dataframe from `authors_refine()` #' @param address_column name of column in quotes where the addresses are -#' @param google_api if FALSE georeferencing is carried out with the -#' tidygeocoder package (geocode() with method = 'osm'). if TRUE geocoding -#' is done with the google maps API. Defaults to FALSE. +#' @param google_api if `google_api = FALSE` georeferencing is carried out with +#' the `tidygeocoder` package (option `geocode()` with `method = 'osm'`). +#' If `google_api = TRUE`, then geocoding is done with the Google Maps API. +#' Defaults to `FALSE`. #' @importFrom ggmap geocode #' #' @examples @@ -36,19 +30,18 @@ authors_georef <- function( data, address_column = "address", - google_api) { - -if (google_api == TRUE) { - pt1 <- ("Attention: You have chosen to geocode with the GOOGLE API.\n") - pt2 <- ("This is NOT a free service.\n") - pt3 <- ("Please refer to Google's current billing rates & usage limits.\n") + google_api = FALSE) { + if (google_api == TRUE) { + pt1 <- ("Attention: You have chosen to geocode with the GOOGLE API.\n") + pt2 <- ("This is NOT a free service.\n") + pt3 <- ("Please refer to Google's current billing rates & usage limits.\n") + + message(paste(pt1, pt2, pt3, sep = "")) + rm(pt1, pt2, pt3) - message(paste(pt1, pt2, pt3, sep = "")) - rm(pt1, pt2, pt3) - options(ggmap = list(display_api_key = FALSE)) - + if (!is.character(data$address)) { stop("Address columns are not characters, please change to characters and try again") @@ -227,134 +220,153 @@ if (google_api == TRUE) { # reset ggmaps option to TRUE. This only until the ggmaps gets fixed on.exit(options(ggmap = list(display_api_key = TRUE))) return(outputlist) - -} else { - pt1 <- ("You are Geocoding with OpenStreetMap.\n") - pt2 <- ("This proceeds at a rate of 1 address/second.\n") - pt3 <- ("For large data sets: OSM requests that you consider downloading\n") - pt4 <- ("the complete database to query locally instead of using the API.\n") - pt5 <- ("See the Refsplitr vignette for more information.\n") - message(paste(pt1, pt2, pt3, pt4, pt5, sep = "")) - rm(pt1, pt2, pt3, pt4, pt5) - - - - if (!is.character(data$address)) { - stop("Address columns are not characters, + } else { + pt1 <- ("You are Geocoding with OpenStreetMap.\n") + pt2 <- ("This proceeds at a rate of 1 address/second.\n") + pt3 <- ("For large data sets: OSM requests that you consider downloading\n") + pt4 <- ("the complete database to query locally instead of using the API.\n") + pt5 <- ("See the Refsplitr vignette for more information.\n") + message(paste(pt1, pt2, pt3, pt4, pt5, sep = "")) + rm(pt1, pt2, pt3, pt4, pt5) + + + + if (!is.character(data$address)) { + stop("Address columns are not characters, please change to characters and try again") - } - # a_df <- data[, c( - # "university", "city", "state", "country", - # "postal_code", "authorID", "address" - # )] - - a_df <- data[, c( - "city", "state", "country", - "postal_code", "authorID" - )] - a_df$country[a_df$country=="could not be extracted"]<-NA - a_df$state[a_df$state=="no state"]<-NA - a_df<-a_df[!is.na(a_df$country),] - # select the following columns from the fll dataframe - # a_df<-("authorID", "city","state","postal_code","country") - a_df$addr <- NA - - -a_df$addr <- ifelse(a_df$country == "usa", - ifelse(!is.na(a_df$state), - ifelse(!is.na(a_df$postal_code), - paste(a_df$city, a_df$state, a_df$postal_code, a_df$country, sep = ","), - paste(a_df$city, a_df$state, a_df$country, sep = ",")), - ifelse(!is.na(a_df$postal_code), - paste(a_df$city, a_df$postal_code, a_df$country, sep = ","), - paste(a_df$city, a_df$country, sep = ","))), - paste(a_df$city, a_df$country, sep = ",")) - - - # - # a_df$addr <- ifelse(is.na(a_df$state), - # paste(a_df$city, a_df$country, sep = ","), - # paste(a_df$city, a_df$state, a_df$country, sep = ",") - # ) - - a_df$addr <- ifelse(a_df$country == "Could not be extracted", - NA, - a_df$addr - ) - to_georef_df <- a_df$addr - # Find unique values of the 'id' column and keep all other columns - - - to_georef_df <- unique(a_df$addr) - to_georef_df <- as.data.frame(to_georef_df) - colnames(to_georef_df) <- c("addr") - - # to_georef_df <- na.omit(to_georef_df) - # FOR TESTING ONLY: - # library(tidygeocoder) to_georef_df<-to_georef_df %>% sample_n(500) - to_georef_df <- to_georef_df |> tidygeocoder::geocode(addr, - method = "osm", - lat = latitude, long = longitude - ) - no_latlon <- to_georef_df[is.na(to_georef_df$latitude), ] - perc_missing <- (nrow(no_latlon) / nrow(to_georef_df)) * 100 - - pt1<-c(paste("Unable to georef ", - round(perc_missing, 2), "% of author addresses.\n", sep = "")) - pt2<- c("Check `outputlist$missing_addresses` to see which ones.\n") - message(paste(pt1, pt2, sep = "")) - rm(pt1,pt2,perc_missing) - - # These get merged back into the original - a_df <- - merge( - to_georef_df[, c( - "addr","latitude", "longitude" - )], - a_df, - by = "addr", all.y = TRUE + } + # a_df <- data[, c( + # "university", "city", "state", "country", + # "postal_code", "authorID", "address" + # )] + + a_df <- data[, c( + "city", "state", "country", + "postal_code", "authorID" + )] + a_df$country[a_df$country == "could not be extracted"] <- NA + a_df$state[a_df$state == "no state"] <- NA + a_df <- a_df[!is.na(a_df$country), ] + # select the following columns from the fll dataframe + # a_df<-("authorID", "city","state","postal_code","country") + a_df$addr <- NA + + + a_df$addr <- ifelse(a_df$country == "usa", + ifelse(!is.na(a_df$state), + ifelse(!is.na(a_df$postal_code), + paste(a_df$city, + a_df$state, + a_df$postal_code, + a_df$country, + sep = "," + ), + paste(a_df$city, + a_df$state, + a_df$country, + sep = "," + ) + ), + ifelse(!is.na(a_df$postal_code), + paste(a_df$city, + a_df$postal_code, + a_df$country, + sep = "," + ), + paste(a_df$city, + a_df$country, + sep = "," + ) + ) + ), + paste(a_df$city, + a_df$country, + sep = "," + ) ) - data <- - merge( - a_df[, c( - "authorID","latitude","longitude" - )], - data, - by = c("authorID"), all.y = TRUE + + # + # a_df$addr <- ifelse(is.na(a_df$state), + # paste(a_df$city, a_df$country, sep = ","), + # paste(a_df$city, a_df$state, a_df$country, sep = ",") + # ) + + a_df$addr <- ifelse(a_df$country == "Could not be extracted", + NA, + a_df$addr + ) + to_georef_df <- a_df$addr + + # Find unique values of the 'id' column and keep all other columns + + + to_georef_df <- unique(a_df$addr) + to_georef_df <- as.data.frame(to_georef_df) + colnames(to_georef_df) <- c("addr") + + # to_georef_df <- na.omit(to_georef_df) + + to_georef_df <- to_georef_df |> tidygeocoder::geocode(addr, + method = "osm", + lat = latitude, long = longitude ) - - names(data)[names(data) == 'latitude'] <- 'lat' - names(data)[names(data) == 'longitude'] <- 'lon' - - - no_georef <- data[is.na(data$lat), ] - - addresses<-data - missingaddresses <- data[is.na(data$lat), ] - addresses$lat <- unlist(data$lat) - addresses$lon <- unlist(data$lon) - - outputlist <- list() - outputlist$addresses <- addresses - outputlist$missing_addresses <- missingaddresses - outputlist$no_missing_addresses <- addresses[!is.na(addresses$lat), ] - - - - - - pt1 <- ("The output is a list with three data.frames:\n") - pt2 <- ("outputlist$addresses: all info from 'refine_authors' - plus new columns with lat & long. It includes ALL addresses, + no_latlon <- to_georef_df[is.na(to_georef_df$latitude), ] + perc_missing <- (nrow(no_latlon) / nrow(to_georef_df)) * 100 + + pt1 <- c(paste("Unable to georef ", + round(perc_missing, 2), "% of author addresses.\n", + sep = "" + )) + pt2 <- c("Check `outputlist$missing_addresses` to see which ones.\n") + message(paste(pt1, pt2, sep = "")) + rm(pt1, pt2, perc_missing) + + # These get merged back into the original + a_df <- + merge( + to_georef_df[, c( + "addr", "latitude", "longitude" + )], + a_df, + by = "addr", all.y = TRUE + ) + + data <- + merge( + a_df[, c( + "authorID", "latitude", "longitude" + )], + data, + by = c("authorID"), all.y = TRUE + ) + + names(data)[names(data) == "latitude"] <- "lat" + names(data)[names(data) == "longitude"] <- "lon" + + + no_georef <- data[is.na(data$lat), ] + + addresses <- data + missingaddresses <- data[is.na(data$lat), ] + addresses$lat <- unlist(data$lat) + addresses$lon <- unlist(data$lon) + + outputlist <- list() + outputlist$addresses <- addresses + outputlist$missing_addresses <- missingaddresses + outputlist$no_missing_addresses <- addresses[!is.na(addresses$lat), ] + pt1 <- ("The output is a list with three data.frames:\n") + pt2 <- ("outputlist$addresses: all info from 'refine_authors' + plus new `lat` & `long` columns. It includes ALL addresses, including those that could not be geocoded. \n") - pt3 <- ("outputlist$missing_addresses: a data frame of the addresses that - could NOT be geocoded.\n") - pt4 <- ("outputlist$no_missing_addresses: a data frame with ONLY the addresses - that were geocoded. \n") - message(paste(pt1, pt2, pt3, pt4,sep = "")) - rm(pt1, pt2, pt3, pt4) - - return(outputlist) + pt3 <- ("outputlist$missing_addresses: Includes only the addresses that + could NOT be geocoded.\n") + pt4 <- ("outputlist$no_missing_addresses: Includes only the addresses + that WERE geocoded. \n") + message(paste(pt1, pt2, pt3, pt4, sep = "")) + rm(pt1, pt2, pt3, pt4) + + return(outputlist) + } } -} \ No newline at end of file From 8329f3f9a74bd43f3dc0197dd7a210b878966361 Mon Sep 17 00:00:00 2001 From: embruna Date: Sat, 22 Mar 2025 14:25:53 -0400 Subject: [PATCH 18/34] cleanup and formatting --- R/authors_address.R | 496 +++++++++++++++++++++++++++++++------------- 1 file changed, 350 insertions(+), 146 deletions(-) diff --git a/R/authors_address.R b/R/authors_address.R index bbc68e0..3c67a14 100644 --- a/R/authors_address.R +++ b/R/authors_address.R @@ -1,5 +1,10 @@ #' Parses out address information and splits it into its respective parts. -#' This is an internal function used by \code{authors_clean} +#' This is an internal function used by \code{authors_clean}. Note that parsing +#' addresses is surprisingly difficult, largely because there is no standard +#' format across journals/countries for how there are reported. For example: +#' Ex 1) some journals use dept, univ, city, state, postal code, country +#' Ex 2) others use univ, dept, country, postal code. +#' Ex 3) Postal code is sometimes in the same cell as country, other times not. #' #' \code{authors_address} This function takes the output from #' \code{references_read} and pulls out address information. Splitting it into @@ -8,7 +13,6 @@ #' @param ID the authorID #' @noRd authors_address <- function(addresses, ID) { - addresses <- tolower(addresses) message("\nSplitting addresses\n") @@ -40,7 +44,11 @@ authors_address <- function(addresses, ID) { # correct countries ------------------------------------------------------- - # correct or update names of some countries to make it possible to georef + # format or update names of some countries to make it possible to georef + # the WOS often uses abbreviations, this standardizes them in a way that + # `tidygeocoder` can use them. It also updates country names that have changed + # (e.g., czechia is new name for czech republic). In some cases no changes + # were made (e.g., united arab rep = current country name depends on city). # Define the function correct_countries <- function(my_list, replacements) { @@ -57,16 +65,14 @@ authors_address <- function(addresses, ID) { } return(my_list) } - # NB: also updated country names. - # czechia = new name for czech republic - # united arab rep = current country name depends on city + replacements <- c( "austl" = "australia", "c z" = "czechia", "cz" = "czechia", "czech republic" = "czechia", "fed rep ger" = "germany", - "columbia" = "colombia", + "columbia" = "colombia", # a depressingly common mistake "peoples r china" = "china", "u arab emirates" = "united arab emirates", "mongol peo rep" = "mongolia", @@ -219,12 +225,17 @@ authors_address <- function(addresses, ID) { "usa", a_df$country ) - a_df$state <- ifelse(a_df$country == "usa" & grepl("[a-z]{2} [0-9]{5}", a_df$state), - gsub("[[:digit:]]{5}", "", a_df$state), a_df$state + a_df$state <- ifelse(a_df$country == "usa" & grepl( + "[a-z]{2} [0-9]{5}", + a_df$state + ), + gsub("[[:digit:]]{5}", "", a_df$state), + a_df$state ) a_df$state <- ifelse(a_df$country == "usa" & grepl(" usa", a_df$state), - gsub(" usa", "", a_df$state), a_df$state + gsub(" usa", "", a_df$state), + a_df$state ) @@ -245,8 +256,12 @@ authors_address <- function(addresses, ID) { - a_df$city <- ifelse(a_df$country == "usa" & grepl("[a-z]{2} [0-9]{5}", a_df$city), - a_df$city2, a_df$city + a_df$city <- ifelse(a_df$country == "usa" & grepl( + "[a-z]{2} [0-9]{5}", + a_df$city + ), + a_df$city2, + a_df$city ) @@ -262,10 +277,16 @@ authors_address <- function(addresses, ID) { "usa", a_df$country ) a_df$postal_code <- ifelse(a_df$country == "usa" & grepl(pattern, a_df$postal_code), - gsub("[a-z]", "", a_df$postal_code), a_df$postal_code + gsub("[a-z]", "", a_df$postal_code), + a_df$postal_code ) a_df$state <- ifelse(a_df$country == "usa" & grepl(pattern, a_df$state), - gsub("[0-9]", "", a_df$postal_code), a_df$state + gsub( + "[0-9]", + "", + a_df$postal_code + ), + a_df$state ) @@ -273,48 +294,72 @@ authors_address <- function(addresses, ID) { # BRAZIL clean-up --------------------------------------------------------- a_df$state <- ifelse(a_df$country == "brazil" & nchar(a_df$city) == 2, - a_df$city, a_df$state + a_df$city, + a_df$state ) a_df$city <- ifelse(a_df$country == "brazil" & nchar(a_df$city) == 2, - a_df$city2, a_df$city + a_df$city2, + a_df$city ) a_df$city2 <- ifelse(a_df$country == "brazil" & a_df$city == a_df$city2, - NA, a_df$city2 + NA, + a_df$city2 ) a_df$postal_code <- ifelse(a_df$country == "brazil" & is.na(a_df$postal_code), - a_df$city, a_df$postal_code + a_df$city, + a_df$postal_code ) a_df$state <- ifelse(a_df$country == "brazil" & nchar(a_df$state) > 2, - NA, a_df$state + NA, + a_df$state ) a_df$postal_code <- ifelse(a_df$country == "brazil", - gsub("[A-Za-z]", "", a_df$postal_code), + gsub( + "[A-Za-z]", + "", + a_df$postal_code + ), a_df$postal_code ) a_df$postal_code <- ifelse(a_df$country == "brazil", - gsub("[-]", "", a_df$postal_code), + gsub( + "[-]", + "", + a_df$postal_code + ), a_df$postal_code ) a_df$city <- ifelse(a_df$country == "brazil", - gsub("br-", "", a_df$city), + gsub( + "br-", + "", + a_df$city + ), a_df$city ) a_df$city <- ifelse(a_df$country == "brazil", - gsub("[0-9]", "", a_df$city), + gsub( + "[0-9]", + "", + a_df$city + ), a_df$city ) a_df$state <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2), - a_df$city, a_df$state + a_df$city, + a_df$state ) a_df$postal_code <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2), - a_df$city2, a_df$postal_code + a_df$city2, + a_df$postal_code ) a_df$city <- ifelse(a_df$country == "brazil" & grepl("br-", a_df$city2), - a_df$city2, a_df$city + a_df$city2, + a_df$city ) @@ -343,25 +388,38 @@ authors_address <- function(addresses, ID) { # Define city-to-state mapping city_state_mapping <- data.frame( - city = c("ribeirao preto", "sao carlos", "rio claro", "sorocaba", "seropedica", "rio de janeiro", "rio janeiro", "sao paulo"), + city = c( + "ribeirao preto", "sao carlos", "rio claro", "sorocaba", + "seropedica", "rio de janeiro", "rio janeiro", "sao paulo" + ), state = c("sp", "sp", "sp", "sp", "rj", "rj", "rj", "sp"), stringsAsFactors = FALSE ) # Match cities and states for (i in 1:nrow(city_state_mapping)) { - a_df$city <- ifelse(a_df$country == "brazil" & grepl(city_state_mapping$city[i], a_df$state, ignore.case = TRUE), - city_state_mapping$city[i], a_df$city + a_df$city <- ifelse(a_df$country == "brazil" & + grepl(city_state_mapping$city[i], + a_df$state, + ignore.case = TRUE + ), + city_state_mapping$city[i], a_df$city ) - a_df$state <- ifelse(a_df$country == "brazil" & grepl(city_state_mapping$city[i], a_df$state, ignore.case = TRUE), - city_state_mapping$state[i], a_df$state + a_df$state <- ifelse(a_df$country == "brazil" & + grepl(city_state_mapping$city[i], + a_df$state, + ignore.case = TRUE + ), + city_state_mapping$state[i], a_df$state ) } # Match cities and states for (i in 1:nrow(city_state_mapping)) { - a_df$state <- ifelse(a_df$country == "brazil" & grepl(city_state_mapping$city[i], a_df$city, ignore.case = TRUE), - city_state_mapping$state[i], a_df$state + a_df$state <- ifelse(a_df$country == "brazil" & + grepl(city_state_mapping$city[i], a_df$city, ignore.case = TRUE), + city_state_mapping$state[i], + a_df$state ) } @@ -397,30 +455,44 @@ authors_address <- function(addresses, ID) { # CANADA clean-up --------------------------------------------------------- a_df$state <- ifelse(a_df$country == "canada" & nchar(a_df$city) == 2, - a_df$city, a_df$state + a_df$city, + a_df$state ) a_df$city <- ifelse(a_df$country == "canada" & nchar(a_df$city) == 2, - NA, a_df$city + NA, + a_df$city ) a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2), - a_df$city2, a_df$postal_code + a_df$city2, + a_df$postal_code ) a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city2), - a_df$city2, a_df$postal_code + a_df$city2, + a_df$postal_code ) a_df$state <- ifelse(a_df$country == "canada" & a_df$city2 == a_df$state, NA, a_df$state ) - a_df$city <- ifelse(a_df$country == "canada", a_df$city2, a_df$city) - a_df$city2 <- ifelse(a_df$country == "canada", NA, a_df$city2) + a_df$city <- ifelse(a_df$country == "canada", + a_df$city2, + a_df$city + ) + a_df$city2 <- ifelse(a_df$country == "canada", + NA, + a_df$city2 + ) a_df$city <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$city), - gsub("\\b(\\w{3})\\b \\b(\\w{3})\\b", "", a_df$city), + gsub( + "\\b(\\w{3})\\b \\b(\\w{3})\\b", + "", + a_df$city + ), a_df$city ) @@ -430,13 +502,25 @@ authors_address <- function(addresses, ID) { ) a_df$state <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$state), - gsub("\\b(\\w{3})\\b \\b(\\w{3})\\b", "", a_df$state), + gsub( + "\\b(\\w{3})\\b \\b(\\w{3})\\b", + "", + a_df$state + ), a_df$state ) - a_df$postal_code <- ifelse(a_df$country == "canada" & grepl("\\b(\\w{2,20})\\b \\b(\\w{3})\\b \\b(\\w{3})\\b", a_df$postal_code), - gsub("\\b(\\w{1,2}|\\w{4,})\\b", "", a_df$postal_code), + a_df$postal_code <- ifelse(a_df$country == "canada" & + grepl( + "\\b(\\w{2,20})\\b \\b(\\w{3})\\b \\b(\\w{3})\\b", + a_df$postal_code + ), + gsub( + "\\b(\\w{1,2}|\\w{4,})\\b", + "", a_df$postal_code + ), + a_df$postal_code ) a_df[] <- lapply(a_df, trimws) @@ -444,7 +528,10 @@ authors_address <- function(addresses, ID) { # TODO: a few postal codes still have letters from city - a_df$postal_code <- ifelse(a_df$country == "canada", gsub(" ", "", a_df$postal_code), a_df$postal_code) + a_df$postal_code <- ifelse(a_df$country == "canada", + gsub(" ", "", a_df$postal_code), + a_df$postal_code + ) # UK clean-up ------------------------------------------------------------- @@ -457,13 +544,16 @@ authors_address <- function(addresses, ID) { # a_df$postal_code) a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city2), - a_df$city2, a_df$postal_code + a_df$city2, + a_df$postal_code ) a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$state), - a_df$state, a_df$postal_code + a_df$state, + a_df$postal_code ) a_df$postal_code <- ifelse(a_df$country %in% uk & grepl(pattern, a_df$city), - a_df$city, a_df$postal_code + a_df$city, + a_df$postal_code ) a_df$postal_code <- ifelse(a_df$country %in% uk, @@ -480,9 +570,18 @@ authors_address <- function(addresses, ID) { ) - a_df$state <- ifelse(a_df$country == "england", a_df$city, a_df$state) - a_df$city <- ifelse(a_df$country == "england", NA, a_df$city) - a_df$city <- ifelse(a_df$country == "england", a_df$postal_code, a_df$city) + a_df$state <- ifelse(a_df$country == "england", + a_df$city, + a_df$state + ) + a_df$city <- ifelse(a_df$country == "england", + NA, + a_df$city + ) + a_df$city <- ifelse(a_df$country == "england", + a_df$postal_code, + a_df$city + ) a_df$city <- ifelse(a_df$country == "england", gsub("\\b\\w*\\d\\w*\\b", "", a_df$city), a_df$city @@ -490,13 +589,41 @@ authors_address <- function(addresses, ID) { # TODO: england still needs work - a_df$state <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales", NA, a_df$state) - a_df$state <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales" & is.na(a_df$state), a_df$city, a_df$state) - a_df$city <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales", a_df$postal_code, a_df$city) - a_df$city <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales" & is.na(a_df$city), a_df$city2, a_df$city) - a_df$city <- ifelse(a_df$country == "scotland" | a_df$country == "northern ireland" | a_df$country == "wales", - gsub("\\b\\w*\\d\\w*\\b", "", a_df$city), + a_df$state <- ifelse(a_df$country == "scotland" | + a_df$country == "northern ireland" | + a_df$country == "wales", + NA, + a_df$state + ) + a_df$state <- ifelse(a_df$country == "scotland" | + a_df$country == "northern ireland" | + a_df$country == "wales" & + is.na(a_df$state), + a_df$city, + a_df$state + ) + a_df$city <- ifelse(a_df$country == "scotland" | + a_df$country == "northern ireland" | + a_df$country == "wales", + a_df$postal_code, + a_df$city + ) + a_df$city <- ifelse(a_df$country == "scotland" | + a_df$country == "northern ireland" | + a_df$country == "wales" & + is.na(a_df$city), + a_df$city2, + a_df$city + ) + a_df$city <- ifelse(a_df$country == "scotland" | + a_df$country == "northern ireland" | + a_df$country == "wales", + gsub( + "\\b\\w*\\d\\w*\\b", + "", a_df$city + ), + a_df$city ) @@ -541,28 +668,35 @@ authors_address <- function(addresses, ID) { a_df$postal_code <- ifelse(a_df$country == "india" & grepl("[0-9]{5,10}", a_df$city2), - a_df$city2, a_df$postal_code + a_df$city2, + a_df$postal_code ) a_df$postal_code <- ifelse(a_df$country == "india" & grepl("[0-9]{5,10}", a_df$city), - a_df$city, a_df$postal_code + a_df$city, + a_df$postal_code ) a_df$city2 <- ifelse(a_df$country == "india" & a_df$state == a_df$city2, - a_df$state, a_df$city2 + a_df$state, + a_df$city2 ) a_df$state <- ifelse(a_df$country == "india", NA, a_df$state) a_df$state <- ifelse(a_df$country == "india" & is.na(a_df$postal_code), - a_df$city, a_df$state + a_df$city, + a_df$state ) a_df$city <- ifelse(a_df$country == "india" & a_df$state == a_df$city, - NA, a_df$city + NA, + a_df$city ) a_df$city <- ifelse(a_df$country == "india" & grepl("[0-9]{4,10}", a_df$postal_code), - a_df$postal_code, a_df$city + a_df$postal_code, + a_df$city ) a_df$city <- ifelse(a_df$country == "india" & is.na(a_df$city), - a_df$city2, a_df$city + a_df$city2, + a_df$city ) @@ -575,8 +709,10 @@ authors_address <- function(addresses, ID) { a_df$city ) - a_df$city <- ifelse(a_df$country == "india" & (grepl("delhi", a_df$city) | grepl("delhi", a_df$state)), - "new delhi", a_df$city + a_df$city <- ifelse(a_df$country == "india" & + (grepl("delhi", a_df$city) | grepl("delhi", a_df$state)), + "new delhi", + a_df$city ) @@ -586,32 +722,46 @@ authors_address <- function(addresses, ID) { - a_df$postal_code <- ifelse(a_df$country == "china" & grepl("[0-9]{5,10}", a_df$city2), - a_df$city2, a_df$postal_code + a_df$postal_code <- ifelse(a_df$country == "china" & + grepl("[0-9]{5,10}", a_df$city2), + a_df$city2, + a_df$postal_code ) - a_df$postal_code <- ifelse(a_df$country == "china" & grepl("[0-9]{5,10}", a_df$city), - a_df$city, a_df$postal_code + a_df$postal_code <- ifelse(a_df$country == "china" & + grepl("[0-9]{5,10}", a_df$city), + a_df$city, + a_df$postal_code ) - a_df$postal_code <- ifelse(a_df$country == "china" & grepl("[0-9]{5,10}", a_df$state), - a_df$state, a_df$postal_code + a_df$postal_code <- ifelse(a_df$country == "china" & + grepl("[0-9]{5,10}", a_df$state), + a_df$state, + a_df$postal_code ) a_df$city2 <- ifelse(a_df$country == "china" & a_df$state == a_df$city2, - a_df$state, a_df$city2 + a_df$state, + a_df$city2 + ) + a_df$state <- ifelse(a_df$country == "china", + NA, + a_df$state ) - a_df$state <- ifelse(a_df$country == "china", NA, a_df$state) a_df$state <- ifelse(a_df$country == "china" & is.na(a_df$postal_code), - a_df$city, a_df$state + a_df$city, + a_df$state ) a_df$city <- ifelse(a_df$country == "china" & a_df$state == a_df$city, - NA, a_df$city + NA, + a_df$city ) a_df$city <- ifelse(a_df$country == "china" & grepl("[0-9]{4,10}", a_df$postal_code), - a_df$postal_code, a_df$city + a_df$postal_code, + a_df$city ) a_df$city <- ifelse(a_df$country == "china" & is.na(a_df$city), - a_df$city2, a_df$city + a_df$city2, + a_df$city ) @@ -627,7 +777,8 @@ authors_address <- function(addresses, ID) { a_df$city <- ifelse(a_df$country == "china" & grepl("beijing", a_df$state), - "beijing", a_df$city + "beijing", + a_df$city ) @@ -644,33 +795,38 @@ authors_address <- function(addresses, ID) { pattern <- paste(to_delete, collapse = "|") # Apply the ifelse function to update a_df$city <- ifelse(a_df$country == "china" & - grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), - NA, a_df$city) - a_df$city<- ifelse(a_df$country=="china" & is.na(a_df$city), - a_df$state, a_df$city) - + grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), + NA, a_df$city + ) + a_df$city <- ifelse(a_df$country == "china" & is.na(a_df$city), + a_df$state, a_df$city + ) + a_df[] <- lapply(a_df, trimws) - + # This verifies that what is in `city` is actually a city # (or at least that what is in `city` is NOT a province) - chn_states <- c("guangdong", "shandong", "henan", "jiangsu", "sichuan", - "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", - "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", - "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", - "gansu", "inner mongolia", "jilin", "hainan", "ningxia", - "qinghai", "tibet", "macao") - pattern <- paste(to_delete, collapse = "|") - a_df$city<- ifelse(a_df$country=="china" & - grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), - NA, a_df$city) - - + chn_states <- c( + "guangdong", "shandong", "henan", "jiangsu", "sichuan", + "hebei", "hunan", "zhejiang", "anhui", "hubei", "guangxi", + "yunnan", "jiangxi", "liaoning", "fujian", "shaanxi", + "guizhou", "shanxi", "chongqing", "heilongjiang", "xinjiang", + "gansu", "inner mongolia", "jilin", "hainan", "ningxia", + "qinghai", "tibet", "macao" + ) + pattern <- paste(to_delete, collapse = "|") + a_df$city <- ifelse(a_df$country == "china" & + grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE), + NA, a_df$city + ) + + # pc is letters dash numbers ---------------------------------------------- @@ -682,23 +838,26 @@ authors_address <- function(addresses, ID) { ) a_df$postal_code <- ifelse(grepl(pattern, a_df$state), - a_df$state, a_df$postal_code + a_df$state, + a_df$postal_code ) a_df$state <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$postal_code), - a_df$city, a_df$state + a_df$city, + a_df$state ) a_df$city <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$postal_code), - a_df$postal_code, a_df$city + a_df$postal_code, + a_df$city ) a_df$city2 <- ifelse((grepl(pattern, a_df$postal_code) & a_df$city2 == a_df$city), NA, a_df$city2 ) - + a_df$city <- ifelse(grepl(pattern, a_df$city), gsub("[0-9]", "", a_df$city), @@ -779,24 +938,6 @@ authors_address <- function(addresses, ID) { # Final clean-up of some US cities and states ----------------------------- - # Remove panama canal zone from usa states (for stri) - a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "cz"), - NA, a_df$state - ) - - # armed forces & diplomatic - a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "aa"), - NA, a_df$state - ) - - a_df$city <- ifelse((a_df$country == "usa" & a_df$state == "apo"), - NA, a_df$city - ) - - a_df$city <- ifelse((a_df$country == "usa" & a_df$state == "dpo"), - NA, a_df$city - ) - a_df$city <- ifelse(a_df$city == "university pk", "university park", a_df$city @@ -830,13 +971,15 @@ authors_address <- function(addresses, ID) { a_df$city <- ifelse(grepl("sioux ctr", a_df$city), - (sub("sioux ctr", "sioux city", a_df$city)), a_df$city + (sub("sioux ctr", "sioux city", a_df$city)), + a_df$city ) a_df$city <- ifelse(grepl("sioux ctr", a_df$city), - (sub("sioux ctr", "sioux city", a_df$city)), a_df$city + (sub("sioux ctr", "sioux city", a_df$city)), + a_df$city ) @@ -975,20 +1118,48 @@ authors_address <- function(addresses, ID) { ) pattern <- paste(us_state_abbreviations_lower, collapse = "|") a_df$country_list <- country_list - a_df$state <- ifelse((a_df$country == "usa" & is.na(a_df$state) & grepl(pattern, a_df$address, ignore.case = TRUE, perl = TRUE)), - a_df$country_list, a_df$state + a_df$state <- ifelse((a_df$country == "usa" & + is.na(a_df$state) & + grepl(pattern, a_df$address, ignore.case = TRUE, perl = TRUE)), + a_df$country_list, + a_df$state ) a_df$state <- ifelse((a_df$country == "usa" & grepl("[[:digit:]]", a_df$state)), - gsub("[[:digit:]]", "", a_df$state), a_df$state + gsub("[[:digit:]]", "", a_df$state), + a_df$state ) a_df$state <- ifelse((a_df$country == "usa" & grepl("usa", a_df$state)), - gsub("usa", "", a_df$state), a_df$state + gsub("usa", "", a_df$state), + a_df$state ) a_df$state <- trimws(a_df$state, which = "both") + + + # Remove panama canal zone from usa states (for stri) + a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "cz"), + NA, + a_df$state + ) + + # armed forces & diplomatic + a_df$state <- ifelse((a_df$country == "usa" & a_df$state == "aa"), + NA, + a_df$state + ) + + a_df$city <- ifelse((a_df$country == "usa" & a_df$state == "apo"), + NA, + a_df$city + ) + + a_df$city <- ifelse((a_df$country == "usa" & a_df$state == "dpo"), + NA, + a_df$city + ) # Japanese prefectures & cities sometimes swapped in address -------------- @@ -1002,13 +1173,17 @@ authors_address <- function(addresses, ID) { "assoc", "forest", "corp" ) pattern <- paste(to_delete, collapse = "|") - a_df$city2 <- ifelse((a_df$country == "japan" & grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), - NA, a_df$city2 + a_df$city2 <- ifelse((a_df$country == "japan" & + grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), + NA, + a_df$city2 ) # Remove any with numbers - a_df$city2 <- ifelse((a_df$country == "japan" & grepl("[[:digit:]]", a_df$city2)), - NA, a_df$city2 + a_df$city2 <- ifelse((a_df$country == "japan" & + grepl("[[:digit:]]", a_df$city2)), + NA, + a_df$city2 ) japan_prefectures <- c( @@ -1025,21 +1200,29 @@ authors_address <- function(addresses, ID) { pattern <- paste(japan_prefectures, collapse = "|") - a_df$state <- ifelse((a_df$country == "japan" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), - a_df$city, a_df$state + a_df$state <- ifelse((a_df$country == "japan" & + grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), + a_df$city, + a_df$state ) # This removes all special regions of a city like tokyo from city2 - a_df$city2 <- ifelse((a_df$country == "japan" & grepl(" ku", a_df$city2, ignore.case = TRUE, perl = TRUE)), - NA, a_df$city2 + a_df$city2 <- ifelse((a_df$country == "japan" & + grepl(" ku", a_df$city2, + ignore.case = TRUE, perl = TRUE + )), + NA, + a_df$city2 ) # replace from city with city2 EXCEPT in cases where no state (and therefore # city is correct) and where no city2 (otherwise would bring in NA) - a_df$city <- ifelse((a_df$country == "japan" & !(is.na(a_df$state)) & !(is.na(a_df$city2))), - a_df$city2, a_df$city + a_df$city <- ifelse((a_df$country == "japan" & + !(is.na(a_df$state)) & !(is.na(a_df$city2))), + a_df$city2, + a_df$city ) @@ -1048,8 +1231,10 @@ authors_address <- function(addresses, ID) { - a_df$city <- ifelse((a_df$country == "scotland" & grepl("univ ", a_df$city, ignore.case = TRUE, perl = TRUE)), - gsub("univ ", "", a_df$city), a_df$city + a_df$city <- ifelse((a_df$country == "scotland" & + grepl("univ ", a_df$city, ignore.case = TRUE, perl = TRUE)), + gsub("univ ", "", a_df$city), + a_df$city ) to_delete <- c( @@ -1058,8 +1243,10 @@ authors_address <- function(addresses, ID) { ) pattern <- paste(to_delete, collapse = "|") - a_df$city <- ifelse((a_df$country == "scotland" & grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), - NA, a_df$city + a_df$city <- ifelse((a_df$country == "scotland" & + grepl(pattern, a_df$city, ignore.case = TRUE, perl = TRUE)), + NA, + a_df$city ) @@ -1079,18 +1266,23 @@ authors_address <- function(addresses, ID) { "ndorms", "nat hist", "hlth", " ave", "council", "unit", "nerc", "nat res" ) pattern <- paste(to_delete, collapse = "|") - a_df$city2 <- ifelse((a_df$country == "england" & grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), - NA, a_df$city2 + + a_df$city2 <- ifelse((a_df$country == "england" & + grepl(pattern, a_df$city2, ignore.case = TRUE, perl = TRUE)), + NA, + a_df$city2 ) a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)), - a_df$city2, a_df$city + a_df$city2, + a_df$city ) a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & grepl("london", a_df$address, ignore.case = TRUE, perl = TRUE), - "london", a_df$city + "london", + a_df$city ) a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & @@ -1100,7 +1292,8 @@ authors_address <- function(addresses, ID) { a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & grepl("oxford", a_df$address, ignore.case = TRUE, perl = TRUE), - "oxford", a_df$city + "oxford", + a_df$city ) a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & @@ -1110,20 +1303,31 @@ authors_address <- function(addresses, ID) { a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & grepl("bristol", a_df$address, ignore.case = TRUE, perl = TRUE), - "bristol", a_df$city + "bristol", + a_df$city ) a_df$city <- ifelse((a_df$country == "england" & is.na(a_df$city)) & grepl("lancaster", a_df$address, ignore.case = TRUE, perl = TRUE), - "lancaster", a_df$city + "lancaster", + a_df$city ) - # delete columns used to 2x check ---------------------------------------- + +# final clean up to return ------------------------------------------------ + + + + # delete columns used to 2x check a_df$city2 <- NULL a_df$country_list <- NULL + + # replace blank with NA + + a_df[a_df == ""] <- NA # return output of function ----------------------------------------------- From d8214c722b4d95ff7b3ac3fb8d024c063e3d00a4 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 11:41:35 -0400 Subject: [PATCH 19/34] updating georeferencing info in vignette and readme --- R/authors_georef.R | 17 ++++++++++++----- R/references_read.R | 2 +- README.Rmd | 2 +- vignettes/refsplitr.Rmd | 17 +++++++++++------ 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/R/authors_georef.R b/R/authors_georef.R index a4264f8..c9c2e2a 100644 --- a/R/authors_georef.R +++ b/R/authors_georef.R @@ -33,11 +33,13 @@ authors_georef <- function( google_api = FALSE) { if (google_api == TRUE) { pt1 <- ("Attention: You have chosen to geocode with the GOOGLE API.\n") - pt2 <- ("This is NOT a free service.\n") - pt3 <- ("Please refer to Google's current billing rates & usage limits.\n") - - message(paste(pt1, pt2, pt3, sep = "")) - rm(pt1, pt2, pt3) + pt2 <- ("The number of free API calls in one month is limited.\n") + pt3 <- ("If the number of addresses being georeferenced exceeds \n") + pt4 <- ("this limit, Google WILL bill you for the difference.\n") + pt5 <- ("Please refer to Google's current billing rates & usage limits.\n") + + message(paste(pt1, pt2, pt3, pt4, pt5, sep = "")) + rm(pt1, pt2, pt3, pt4, pt5) options(ggmap = list(display_api_key = FALSE)) @@ -306,6 +308,11 @@ authors_georef <- function( colnames(to_georef_df) <- c("addr") # to_georef_df <- na.omit(to_georef_df) + + message(paste("Number of addresses being geocoded: ", + nrow(to_georef_df), sep = "")) + + to_georef_df <- to_georef_df |> tidygeocoder::geocode(addr, method = "osm", diff --git a/R/references_read.R b/R/references_read.R index 9154a8b..44f8412 100644 --- a/R/references_read.R +++ b/R/references_read.R @@ -13,7 +13,7 @@ #' @param include_all if FALSE only a subset of commonly used fields from references records are imported. #' If TRUE then all fields from the reference records are imported. Defaults to FALSE. #' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, FX, GA, J9, -#' LA, PA, PI, PN, PS, RID, SU, VR. +#' LA, PA, PI, PN, PS, RID, SU, VR, OA. #' @export references_read #' #' @examples diff --git a/README.Rmd b/README.Rmd index 0191441..ed79b38 100644 --- a/README.Rmd +++ b/README.Rmd @@ -35,7 +35,7 @@ There are four steps in the `refsplitr` package's workflow: 1. Importing and tidying Web of Science reference records (be sure to download records using the procedure in Appendix 1 of the [vignette](https://docs.ropensci.org/refsplitr/articles/refsplitr.html)) 2. Author name disambiguation and parsing of author addresses -3. Georeferencing of author institutions using either [`tidygeocoder`](https://jessecambon.github.io/tidygeocoder/) (default; free) or the Google Maps API (paid; for additional details on how to register with Google prior to georeferencing see the [`ggmap`](https://github.com/dkahle/ggmap) repository and `refsplitr` [vignette](https://docs.ropensci.org/refsplitr/articles/refsplitr.html).) +3. Georeferencing of author institutions using either the [Nominatim](https://nominatim.org/) service, which uses OpenStreetMap data and which `refsplitr` queries via the [`tidygeocoder`]((https://jessecambon.github.io/tidygeocoder/) package (default; free) _OR_ the [Data Science Toolkit](http://www.datasciencetoolkit.org/), which uses the Google maps API (limited number of free queries after which users must pay); for additional details on pricing information how to register with Google to use their API see the `refsplitr` [vignette](https://docs.ropensci.org/refsplitr/articles/refsplitr.html). 4. Data visualization The procedures required for these four steps,each of which is implemented with a simple command, are described in detail in the `refsplitr` [vignette](https://docs.ropensci.org/refsplitr/articles/refsplitr.html). An example of this workflow is provided below: diff --git a/vignettes/refsplitr.Rmd b/vignettes/refsplitr.Rmd index 16c68f7..fc8d9e3 100644 --- a/vignettes/refsplitr.Rmd +++ b/vignettes/refsplitr.Rmd @@ -255,21 +255,25 @@ example_a_refined_mod <- authors_refine(example_a_corrected, ### 2.3. Georeferencing author institutions -Users can georeference author's institutions (latitude & longitude) using the `authors_georef()` function. This function has has two arguments: +Users can georeference author institutions (latitude & longitude) using the `authors_georef()` function. This function has has three arguments: - **data**: The output created by `authors_refine()`. Must be an object. - **address_column**: A quoted character identifying the column name in which addresses are stored. This defaults to the `address` column from the `authors_refine()` output. -The output of `authors_georef()` is a list with three elements: (1) `addresses` contains all records, (2) `missing_addresses` contains the records that could not be georeferenced, and (3) `not_missing_addresses` contains only the records with georeferenced addresses. +- **google_api**: If `FALSE` the addresses will be referenced with the [`tidygeocoder`](https://jessecambon.github.io/tidygeocoder/) package (free). If `TRUE` addresses will be georeferenced with the [Data Science Toolkit](http://www.datasciencetoolkit.org/), which uses the Google Maps API (paid after a threshold number of free queries). This defaults to `FALSE`. See **WARNINGS** below for additional details on pricing and howe to register for a Google Maps API key. + +The output of `authors_georef()` is a list with three elements: (1) `addresses` contains all records, (2) `missing_addresses` contains the records that could not be georeferenced, and (3) `not_missing_addresses` contains only the records with georeferenced addresses. The time required to georeference depends on the number of addresses being processed, the speed of the internet connection, and the processing power of the computer on which analyses are being conducted. #### **WARNINGS**: (1) The `authors_georef()` function requires address be data type = character. If importing a .csv file with the results of `authors_refine()` for processing with `authors_georef()`, be sure to include "stringsAsFactors = FALSE" in the `read.csv` command. -(2) The `authors_georef()` function parses addresses from the Web of Science reference sheet and then attempts to calculate the latitude and longitude for them with the [Data Science Toolkit](http://www.datasciencetoolkit.org/). The time required to do so depends on the number of addresses being processed, the speed of the internet connection, and the processing power of the computer on which analyses are being conducted. +(2) The `authors_georef()` function parses addresses from the Web of Science reference sheet and then uses one of two services to query the latitude and longitude of each address. The default option is the free [Nominatim](https://nominatim.org/) service, which uses OpenStreetMap (OSM) data and which `refsplitr` queries via the [`tidygeocoder`]((https://jessecambon.github.io/tidygeocoder/) package. Alternatively, users may elect to geocode via the [Data Science Toolkit](http://www.datasciencetoolkit.org/), which uses the Google maps API. This service allows a limited number of free queries (currently 10,000 per month) before charging users. _Many bibliometric projects will exceed this limit_. Please be sure to check the Google Maps [pricing information](https://mapsplatform.google.com/pricing/?utm_experiment=13102311) see if your research will do so and how much you will be billed for queries over this amount. You may be able to get free additional credits if you are based at a nonprofit; Google also offers faculty free credits via their teaching program. Additional information can be found [here](https://developers.google.com/maps/billing-and-pricing/public-programs?hl=en&_gl=1*1cwzikc*_ga*MTAxMzA3OTgzMy4xNzQyNjY5MzU4*_ga_NRWSTWS78N*MTc0MjgyOTEzNS4yLjEuMTc0MjgyOTI4Mi4wLjAuMA). + +(3) To use the Google Maps option you will need a Google Maps API key. Details on how to register for one are available on the [`ggmap`](https://github.com/dkahle/ggmap?tab=readme-ov-file#google-maps-api-key) repository. -(3) This version of `refsplitr` (v1.0) has difficulty differentiating between some geographically distinct installations of the same institution (e.g. the Mississippi State University Main Campus in Starkville vs the Mississippi State University Coastal Research and Extension located 250 miles away in Biloxi). +(4) Georeferencing with the Google Maps API is slightly more powerful - while `refsplitr`'s OSM queries use city and country (also state if in the USA) to georeference, the Google Maps queries can include the name of the institution. This is useful if the city is not included in an author's address, but it can also result in the same lat-long being returned for geographically distinct facilities of the same institution (e.g. the Mississippi State University Main Campus (in Starkville) and the Mississippi State University Coastal Research and Extension (in Biloxi)). #### Example @@ -279,7 +283,8 @@ a. to georeference author institutions: ```r example_georef <-authors_georef(data=example_a_refined, - address_column = "address") + address_column = "address", + google_api = FALSE) ``` Note that while this function is being executed a message will be printed every time a location is geoprocessed. These messages can be suppressed by modifying the function call as follows: @@ -514,7 +519,7 @@ Table: Definition of column headings in the output of read_references()^1^. Most |refID |a unique identifier for each article in the dataset assigned by refnet | -^1^the following Web of Science data fields are only included if users select the `include_all=TRUE` option in `references_read()`: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI, LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR. +^1^the following Web of Science data fields are only included if users select the `include_all=TRUE` option in `references_read()`: CC, CH, CL, CT, CY, DT, FX, GA, GE, J9, LA, PA, PI, PN, PS, RID, SU, VR, OA. ^2^Includes citations in the Web of Science Core Collection, BIOSIS Citation Index, Chinese Science Citation Database, Data Citation Index, Russian Science Citation Index, and SciELO Citation Index. From 0b863fa0c3d0f381d1bedd171efb52ac1ea559ba Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 13:39:22 -0400 Subject: [PATCH 20/34] no longer uses deprecated `fortify`, update of vignette and news --- NEWS.md | 41 ++++++++++++++++++++++++++++++++--------- R/authors_address.R | 9 ++++++--- R/authors_georef.R | 8 ++++++-- R/plot_net_address.R | 5 ++++- vignettes/refsplitr.Rmd | 10 ++++++---- 5 files changed, 54 insertions(+), 19 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2e748af..aab68f0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,14 +1,22 @@ # refsplitr News -refsplitr 1.5 (2025-04-15) +refsplitr 1.2 (2025-04-26) ========================= ### NEW FEATURES - * The new default service for georeferencing author institutions is [`tidygeocoder`](https://jessecambon.github.io/tidygeocoder/). Using [`ggmap`](https://github.com/dkahle/ggmap) to access the Google Maps API is still an option, but users should be aware that this is no longer a free service. + * The new default service for georeferencing author institutions is the free + [Nominatim](https://nominatim.org/) service, which uses OpenStreetMap (OSM) data and + which `refsplitr` queries via the [`tidygeocoder`]((https://jessecambon.github.io/tidygeocoder/) + package.[`tidygeocoder`](https://jessecambon.github.io/tidygeocoder/). + The Google Maps API is still an option, but users should be aware that their + georeferencing request may exceed the lower limit of free queries. - * The `authors_addresses` function has been updated. + * The `authors_addresses` function has been updated and is now more efficient. + + * In `plot_net_address`: the deprecated function `fortify` has been replaced + with `sf_convert` refsplitr 1.0.2 (2024-08-12) @@ -17,11 +25,21 @@ refsplitr 1.0.2 (2024-08-12) ### NEW FEATURES - * `references_read` now extracts additional fields from Web of Science records: WE (Source Database), C3 (all author affiliations, equivalent to the Scopus `affiliations` field code), EI (eISSN), OA (Open Access), and RID (the original version of the Thomson-Reuters ResearcherID (RI); authors of some older publications might have an RID but not an RI). These are not included in the default output of `references_read`, to include them use `include_all = TRUE`. + * `references_read` now extracts additional fields from Web of Science + records: WE (Source Database), C3 (all author affiliations, equivalent to the + Scopus `affiliations` field code), EI (eISSN), OA (Open Access), and RID + (the original version of the Thomson-Reuters ResearcherID (RI); authors of + some older publications might have an RID but not an RI). These are not + included in the default output of `references_read`; to include + them use `include_all = TRUE`. - * `references_read` no longer extracts some rarely used field codes: GE, LT, MC, MI, and TA + * `references_read` no longer extracts some rarely used field codes: + GE, LT, MC, MI, and TA - * The following field codes are now returned by default when using `references_read`: DT (Document Type), ID (Keywords Plus), IS (Issue), JI (ISO abbreviated source code), and NR (number of references cited by the article). + * The following field codes are now returned by default when using + `references_read`: DT (Document Type), ID (Keywords Plus), IS (Issue), + JI (ISO abbreviated source code), and NR (number of references cited + by the article). refsplitr 1.0.1 (2024-07-23) @@ -29,15 +47,20 @@ refsplitr 1.0.1 (2024-07-23) ### NEW FEATURES - * output of `plot_net_country()` now includes a list of any authors that have a lat-lon but no country (called with `products$fixable_countries`).Users can correct these and re-run the visualization to include them in the graph. + * output of `plot_net_country()` now includes a list of any authors that have + a lat-lon but no country (called with `products$fixable_countries`).Users can + correct these and re-run the visualization to include them in the graph. ### DEPRECATED AND DEFUNCT - * Removed the dependency on deprecated package [maptools](https://cran.r-project.org/web/packages/maptools/index.html). [(#90)](https://github.com/ropensci/refsplitr/issues/90) + * Removed the dependency on deprecated package + [maptools](https://cran.r-project.org/web/packages/maptools/index.html). + [(#90)](https://github.com/ropensci/refsplitr/issues/90) ### DOCUMENTATION FIXES - * Updated README with citation of the _Journal of Open Source Software_ article describing refsplitr. + * Updated README with citation of the _Journal of Open Source Software_ + article describing refsplitr. refsplitr 0.9.0 (2020-01-14) diff --git a/R/authors_address.R b/R/authors_address.R index 3c67a14..8fc3f50 100644 --- a/R/authors_address.R +++ b/R/authors_address.R @@ -15,7 +15,7 @@ authors_address <- function(addresses, ID) { addresses <- tolower(addresses) - message("\nSplitting addresses\n") + message("\nSplitting addresses.\n") list_address <- strsplit(addresses, ",") @@ -985,9 +985,12 @@ authors_address <- function(addresses, ID) { # Final clean-up of some Brazil cities and states ------------------------- + a_df$city <- ifelse(a_df$country=="brazil" & grepl("seropedica", a_df$city), + "seropedica", + a_df$city + ) - - a_df$city <- ifelse(a_df$city == "gavea rio de janeiro", + a_df$city <- ifelse(a_df$country == "brazil" & a_df$city == "gavea rio de janeiro", "rio de janeiro", a_df$city ) diff --git a/R/authors_georef.R b/R/authors_georef.R index c9c2e2a..ca73bf9 100644 --- a/R/authors_georef.R +++ b/R/authors_georef.R @@ -23,7 +23,8 @@ #' #' @examples #' \dontrun{ -#' BITR_georef_df <- authors_georef(BITR_refined, address_column = "address") +#' BITR_georef_df <- authors_georef(BITR_refined, address_column = "address", +#' google_api=FALSE) #' } #' @export authors_georef #' @@ -223,6 +224,9 @@ authors_georef <- function( on.exit(options(ggmap = list(display_api_key = TRUE))) return(outputlist) } else { + + requireNamespace(package = "tidygeocoder", quietly = TRUE) + pt1 <- ("You are Geocoding with OpenStreetMap.\n") pt2 <- ("This proceeds at a rate of 1 address/second.\n") pt3 <- ("For large data sets: OSM requests that you consider downloading\n") @@ -309,7 +313,7 @@ authors_georef <- function( # to_georef_df <- na.omit(to_georef_df) - message(paste("Number of addresses being geocoded: ", + message(paste("Number of locations being geocoded: ", nrow(to_georef_df), sep = "")) diff --git a/R/plot_net_address.R b/R/plot_net_address.R index 691bd9c..58b8405 100644 --- a/R/plot_net_address.R +++ b/R/plot_net_address.R @@ -193,7 +193,10 @@ plot_net_address <- function(data, } ## Create the world outlines: world_map@data$id <- rownames(world_map@data) - world_map.points <- ggplot2::fortify(world_map) + + + # world_map.points <- ggplot2::fortify(world_map) # deprecated + world_map.points <- sf_convert(world_map) world_map.df <- merge(world_map.points, world_map@data, by = "id", all = TRUE) world_map.df <- world_map.df[!is.na(world_map.df$lat), ] diff --git a/vignettes/refsplitr.Rmd b/vignettes/refsplitr.Rmd index fc8d9e3..817f818 100644 --- a/vignettes/refsplitr.Rmd +++ b/vignettes/refsplitr.Rmd @@ -267,13 +267,15 @@ The output of `authors_georef()` is a list with three elements: (1) `addresses` #### **WARNINGS**: -(1) The `authors_georef()` function requires address be data type = character. If importing a .csv file with the results of `authors_refine()` for processing with `authors_georef()`, be sure to include "stringsAsFactors = FALSE" in the `read.csv` command. +(1) The `authors_georef()` function requires address be data type = character. If importing a .csv file with the results of `authors_refine()` for processing with `authors_georef()`, be sure to include "stringsAsFactors = FALSE" in the `read.csv` command. -(2) The `authors_georef()` function parses addresses from the Web of Science reference sheet and then uses one of two services to query the latitude and longitude of each address. The default option is the free [Nominatim](https://nominatim.org/) service, which uses OpenStreetMap (OSM) data and which `refsplitr` queries via the [`tidygeocoder`]((https://jessecambon.github.io/tidygeocoder/) package. Alternatively, users may elect to geocode via the [Data Science Toolkit](http://www.datasciencetoolkit.org/), which uses the Google maps API. This service allows a limited number of free queries (currently 10,000 per month) before charging users. _Many bibliometric projects will exceed this limit_. Please be sure to check the Google Maps [pricing information](https://mapsplatform.google.com/pricing/?utm_experiment=13102311) see if your research will do so and how much you will be billed for queries over this amount. You may be able to get free additional credits if you are based at a nonprofit; Google also offers faculty free credits via their teaching program. Additional information can be found [here](https://developers.google.com/maps/billing-and-pricing/public-programs?hl=en&_gl=1*1cwzikc*_ga*MTAxMzA3OTgzMy4xNzQyNjY5MzU4*_ga_NRWSTWS78N*MTc0MjgyOTEzNS4yLjEuMTc0MjgyOTI4Mi4wLjAuMA). +(2) The `authors_georef()` function parses addresses from the Web of Science reference sheet and then uses one of two services to query the latitude and longitude of each address. The default option is the free [Nominatim](https://nominatim.org/) service, which uses OpenStreetMap (OSM) data and which `refsplitr` queries via the [`tidygeocoder`]((https://jessecambon.github.io/tidygeocoder/) package. Alternatively, users may elect to geocode via the [Data Science Toolkit](http://www.datasciencetoolkit.org/), which uses the Google maps API. This service allows a limited number of free queries (currently 10,000 per month) before charging users. _Many bibliometric projects will exceed this limit_. Please be sure to check the Google Maps [pricing information](https://mapsplatform.google.com/pricing/?utm_experiment=13102311) see if your research will do so and how much you will be billed for queries over this amount. You may be able to get free additional credits if you are based at a nonprofit; Google also offers faculty free credits via their teaching program. Additional information can be found [here](https://developers.google.com/maps/billing-and-pricing/public-programs?hl=en&_gl=1*1cwzikc*_ga*MTAxMzA3OTgzMy4xNzQyNjY5MzU4*_ga_NRWSTWS78N*MTc0MjgyOTEzNS4yLjEuMTc0MjgyOTI4Mi4wLjAuMA). -(3) To use the Google Maps option you will need a Google Maps API key. Details on how to register for one are available on the [`ggmap`](https://github.com/dkahle/ggmap?tab=readme-ov-file#google-maps-api-key) repository. +(3) Georeferencing using OSM proceeds at a rate of 1 address/second. For large data sets, OSM requests that you consider downloading the complete database to query locally instead of using the API; in addition to reducing demand on their servers this also has the advantage of being much faster. Instructions on how to do so can be found [here](https://nominatim.org/). -(4) Georeferencing with the Google Maps API is slightly more powerful - while `refsplitr`'s OSM queries use city and country (also state if in the USA) to georeference, the Google Maps queries can include the name of the institution. This is useful if the city is not included in an author's address, but it can also result in the same lat-long being returned for geographically distinct facilities of the same institution (e.g. the Mississippi State University Main Campus (in Starkville) and the Mississippi State University Coastal Research and Extension (in Biloxi)). +(4) To use the Google Maps option you will need a Google Maps API key. Details on how to register for one are available on the [`ggmap`](https://github.com/dkahle/ggmap?tab=readme-ov-file#google-maps-api-key) repository. + +(5) Georeferencing with the Google Maps API is slightly more powerful - while `refsplitr`'s OSM queries use city and country (also state if in the USA) to georeference, the Google Maps queries can include the name of the institution. This is useful if the city is not included in an author's address, but it can also result in the same lat-long being returned for geographically distinct facilities of the same institution (e.g. the Mississippi State University Main Campus (in Starkville) and the Mississippi State University Coastal Research and Extension (in Biloxi)). #### Example From 5f65780c40d21242b87677dea89ac3a17c4ff493 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 13:54:57 -0400 Subject: [PATCH 21/34] update messages --- R/authors_address.R | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/R/authors_address.R b/R/authors_address.R index 8fc3f50..cea7f43 100644 --- a/R/authors_address.R +++ b/R/authors_address.R @@ -16,6 +16,7 @@ authors_address <- function(addresses, ID) { addresses <- tolower(addresses) message("\nSplitting addresses.\n") + list_address <- strsplit(addresses, ",") @@ -85,10 +86,14 @@ authors_address <- function(addresses, ID) { "syrian arab rep" = "syria" ) + message("\nstandardizing country names...\n") + list_address <- correct_countries(list_address, replacements) # extract university ------------------------------------------------------ + message("\nextracting the names of institutions...\n") + university_list <- vapply(list_address, function(x) x[1], character(1)) # extract department ------------------------------------------------------ @@ -112,6 +117,8 @@ authors_address <- function(addresses, ID) { # Extract City ------------------------------------------------------------ + message("\nextracting cities...\n") + # If there is only one element, then it can't have both city and country' city_list <- vapply(list_address, function(x) { n <- length(x) @@ -134,7 +141,8 @@ authors_address <- function(addresses, ID) { # extract state ----------------------------------------------------------- - + message("\nextracting states/provinces...\n") + # If there is only one element, then it can't have both city and country' state_list <- vapply(list_address, function(x) { n <- length(x) @@ -160,6 +168,8 @@ authors_address <- function(addresses, ID) { # Extract Country --------------------------------------------------------- + message("\nextracting country...\n") + country_list <- vapply( list_address, function(x) { gsub("\\_", "", x[length(x)]) @@ -170,11 +180,16 @@ authors_address <- function(addresses, ID) { # postal code (pc) list --------------------------------------------------- + message("\nprocessing postal codes...\n") + # pc often with city pc_list <- city_list # bind all into df -------------------------------------------------------- + + message("\nreview, correction, and clean-up...\n") + message("\nPlease be patient - this might take a bit.\n") a_df <- data.frame( adID = ID, @@ -362,7 +377,8 @@ authors_address <- function(addresses, ID) { a_df$city ) - + message("\n(still working on it...)\n") + # repeat the clean of city a_df$city <- ifelse(a_df$country == "brazil", gsub("br-", "", a_df$city), @@ -587,6 +603,8 @@ authors_address <- function(addresses, ID) { a_df$city ) + message("\n(getting closer...)\n") + # TODO: england still needs work a_df$state <- ifelse(a_df$country == "scotland" | @@ -807,7 +825,7 @@ authors_address <- function(addresses, ID) { a_df[] <- lapply(a_df, trimws) - + message("\n(not much longer...)\n") # This verifies that what is in `city` is actually a city # (or at least that what is in `city` is NOT a province) @@ -985,6 +1003,8 @@ authors_address <- function(addresses, ID) { # Final clean-up of some Brazil cities and states ------------------------- + message("\n(almost done...)\n") + a_df$city <- ifelse(a_df$country=="brazil" & grepl("seropedica", a_df$city), "seropedica", a_df$city @@ -1110,7 +1130,7 @@ authors_address <- function(addresses, ID) { # still some us states not extracting properly but fixed here ------------- - + message("\n(so close...the end is in sight!)\n") us_state_abbreviations_lower <- c( "al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga", @@ -1256,7 +1276,9 @@ authors_address <- function(addresses, ID) { # fine-tuning ENGLAND ----------------------------------------------------- - + message("\n(this is it - the last step!)\n") + + to_delete <- c( "&", "inst", "ctr", "med", "chem", "lab", "biol", "dept", "div", "univ", "hosp", "coll", "sci", "rd", From 13ab87b3c33380a1073a9131f30709b95e5f368e Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 14:44:08 -0400 Subject: [PATCH 22/34] edit test styatement to remove white space --- tests/testthat/test_authors_address.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_authors_address.R b/tests/testthat/test_authors_address.R index d0793c0..4fdbb4a 100644 --- a/tests/testthat/test_authors_address.R +++ b/tests/testthat/test_authors_address.R @@ -16,10 +16,10 @@ address=c("Univ Sydney, Fac Vet Sci, Sch Life & Environm Sci, actual<-authors_address(df$address, df$authorID) expect_false(any(is.na(actual$country))) expect_false(grepl('BR',actual$postal_code[actual$country=='brazil'])) -expect_equal(sum(grepl("[a-z]{1}[0-9]{1}[a-z]{1}\\s[0-9]{1}[a-z]{1}[0-9]{1}", +expect_equal(sum(grepl("[a-z]{1}[0-9]{1}[a-z]{1}[0-9]{1}[a-z]{1}[0-9]{1}", actual$postal_code)),2) expect_equal(unique(actual$country[grepl( - "[a-z]{1}[0-9]{1}[a-z]{1}\\s[0-9]{1}[a-z]{1}[0-9]{1}", + "[a-z]{1}[0-9]{1}[a-z]{1}[0-9]{1}[a-z]{1}[0-9]{1}", actual$postal_code)]),'canada') expect_equal(c(actual$country[8], actual$state[8], actual$postal_code[8]), c('usa','fl','33312')) From b49ed507627415a1ac0546e8245cb2020e953ae6 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 19:48:36 -0400 Subject: [PATCH 23/34] change ifelse to if / if --- DESCRIPTION | 2 +- R/authors_georef.R | 4 +++- man/BITR.Rd | 4 ++-- man/BITR_geocode.Rd | 8 ++++---- man/authors_georef.Rd | 37 ++++++++++++++++++------------------- man/references_read.Rd | 17 ++++++++--------- man/refsplitr-package.Rd | 2 +- 7 files changed, 37 insertions(+), 37 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4bf0f72..b873658 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -64,7 +64,7 @@ VignetteBuilder: Remotes: dkahle/ggmap Encoding: UTF-8 -RoxygenNote: 7.1.2 +RoxygenNote: 7.3.2 X-schema.org-keywords: name disambiguation, bibliometrics, diff --git a/R/authors_georef.R b/R/authors_georef.R index ca73bf9..7156523 100644 --- a/R/authors_georef.R +++ b/R/authors_georef.R @@ -223,7 +223,9 @@ authors_georef <- function( # reset ggmaps option to TRUE. This only until the ggmaps gets fixed on.exit(options(ggmap = list(display_api_key = TRUE))) return(outputlist) - } else { + } + + if (google_api != TRUE) { requireNamespace(package = "tidygeocoder", quietly = TRUE) diff --git a/man/BITR.Rd b/man/BITR.Rd index 275ed53..ae1e13f 100644 --- a/man/BITR.Rd +++ b/man/BITR.Rd @@ -3,7 +3,7 @@ \docType{data} \name{BITR} \alias{BITR} -\title{Data from the journal BioTropica (pulled from Web of Knowledge)} +\title{Data from the journal Biotropica (pulled from Web of Knowledge)} \format{ A data frame with 10 rows and 32 variables: \describe{ @@ -47,7 +47,7 @@ The remaining codes are described on the Web of Knowledge website: BITR } \description{ -A dataset containing 10 articles taken from the BioTropica journal. +A dataset containing 10 articles taken from the journal Biotropica. This dataset represents the typical formatted output from \code{references_read()} in the refsplitr package. It serves as a testbed for commonly miscategorized names } diff --git a/man/BITR_geocode.Rd b/man/BITR_geocode.Rd index ef960ba..1351a7e 100644 --- a/man/BITR_geocode.Rd +++ b/man/BITR_geocode.Rd @@ -3,7 +3,7 @@ \docType{data} \name{BITR_geocode} \alias{BITR_geocode} -\title{Georeferenced data from the journal BioTropica (pulled from Web of Science)} +\title{Georeferenced data from the journal Biotropica (pulled from Web of Science)} \format{ A data frame with 41 rows and 15 variables: \describe{ @@ -15,8 +15,8 @@ institution for non-universities} \item{lat}{numeric, latitude populated from authors_georef} \item{lon}{numeric, longitude populated from authors_georef} \item{groupID}{ID field for what name group the author -is identied as from authors_clean()} -\item{author_order}{numeric, order of author from jounral article} +is identified as from authors_clean()} +\item{author_order}{numeric, order of author from journal article} \item{address}{address of references pulled from the original raw WOS file} \item{department}{department which is nested within university} @@ -34,7 +34,7 @@ given by references_read()} BITR_geocode } \description{ -A dataset containing 41 authors taken from the BioTropica journal. +A dataset containing 41 authors taken from the Biotropica journal. This dataset represents the typical formatted output from \code{authors_georef()} in the refsplitr package. It serves as a useful testing data set for diff --git a/man/authors_georef.Rd b/man/authors_georef.Rd index 01546c6..0debb64 100644 --- a/man/authors_georef.Rd +++ b/man/authors_georef.Rd @@ -4,37 +4,36 @@ \alias{authors_georef} \title{Extracts the lat and long for each address from authors_clean} \usage{ -authors_georef(data, address_column = "address") +authors_georef(data, address_column = "address", google_api = FALSE) } \arguments{ \item{data}{dataframe from \code{authors_refine()}} \item{address_column}{name of column in quotes where the addresses are} + +\item{google_api}{if \code{google_api = FALSE} georeferencing is carried out with +the \code{tidygeocoder} package (option \code{geocode()} with \code{method = 'osm'}). +If \code{google_api = TRUE}, then geocoding is done with the Google Maps API. +Defaults to \code{FALSE}.} } \description{ \code{authors_georef} This function takes the final author list from -refine_authors, and calculates the lat long of the addresses. -It does this by feeding the addresses into data science toolkit. -In order to maximize effectiveness and mitigate errors in parsing addresses -We run this multiple times creating addresses in different ways -in hopes that the google georeferencing API can recognize an address -1st. University, city, zipcode, country -2nd. City, zipcode, country -3rd. city, country -4th. University, country +refine_authors, and calculates the lat long of the city, country, and postal +code (for USA addresses) or city and country (for addresses outside the USA). } \details{ -The output is a list with three data.frames -\code{addresses} is a data frame with all information from -refine_authors plus new location columns and calculated lat longs. -\code{missing addresses} is a data frame with all addresses could -not be geocoded -\code{addresses} is a data frame like \code{addresses} except -the missing addresses are gone. +The output is a list of three data.frames +\code{addresses} All info from 'refine_authors' plus new columns with +lat & long. It includes ALL addresses, including those that could not +be geocoded. +\code{missing_addresses} A data frame of the addresses that could +NOT be geocoded. +\code{no_missing_addresses} the \code{addresses} data frame with ONLY the +addresses that were geocoded. } \examples{ - \dontrun{ -BITR_georef_df <- authors_georef(BITR_refined, address_column='address') +BITR_georef_df <- authors_georef(BITR_refined, address_column = "address", +google_api=FALSE) } } diff --git a/man/references_read.Rd b/man/references_read.Rd index aba14e6..2c43b01 100644 --- a/man/references_read.Rd +++ b/man/references_read.Rd @@ -9,7 +9,7 @@ references_read(data = ".", dir = FALSE, include_all = FALSE) \arguments{ \item{data}{the location of the file or files to be imported. This can be either the absolute or relative name of the file (for a single file) or folder (for multiple files stored in the same folder; -used in conjuction with `dir = TRUE``). If left blank it is assumed the location is the working directory.} +used in conjunction with `dir = TRUE``). If left blank it is assumed the location is the working directory.} \item{dir}{if FALSE it is assumed a single file is to be imported. Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=``; @@ -17,25 +17,24 @@ all files in the folder will be imported). Defaults to FALSE.} \item{include_all}{if FALSE only a subset of commonly used fields from references records are imported. If TRUE then all fields from the reference records are imported. Defaults to FALSE. -The additional data fields included if \code{include_all=TRUE}: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI, -LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR.} +The additional data fields included if \code{include_all=TRUE}: CC, CH, CL, CT, CY, FX, GA, J9, +LA, PA, PI, PN, PS, RID, SU, VR, OA.} } \description{ \code{references_read} This function reads Thomson Reuters Web of Knowledge and ISI format reference data files into an R-friendly data format. The resulting dataframe -is the argument for the refplitr function \code{authors_clean()}. +is the argument for the refsplitr function \code{authors_clean()}. } \examples{ -## If a single files is being imported from a folder called "data" located in an RStudio Project: +## If a single files is being imported from a folder called "data" located in an RStudio Project: ## imported_refs<-references_read(data = './data/refs.txt', dir = FALSE, include_all=FALSE) ## If multiple files are being imported from a folder named "heliconia" nested within a folder -## called "data" located in an RStudio Project: +## called "data" located in an RStudio Project: ## heliconia_refs<-references_read(data = './data/heliconia', dir = TRUE, include_all=FALSE) -## To load the Web of Science records used in the examples in the documentation -BITR_data_example <- system.file('extdata', 'BITR_test.txt', package = 'refsplitr') +## To load the Web of Science records used in the examples in the documentation +BITR_data_example <- system.file("extdata", "BITR_test.txt", package = "refsplitr") BITR <- references_read(BITR_data_example) - } diff --git a/man/refsplitr-package.Rd b/man/refsplitr-package.Rd index 495c191..f8b4e96 100644 --- a/man/refsplitr-package.Rd +++ b/man/refsplitr-package.Rd @@ -6,7 +6,7 @@ \alias{refsplitr-package} \title{refsplitr: author name disambiguation, author georeferencing, and mapping of coauthorship networks with 'Web of Science' data} \description{ -\if{html}{\figure{logo.png}{options: align='right' alt='logo' width='120'}} +\if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} Tools to parse and organize reference records downloaded from the 'Web of Science' citation database into an R-friendly format, disambiguate the names of authors, geocode their locations, and generate/visualize coauthorship networks. This package has been peer-reviewed by rOpenSci (v. 1.0). } From fd6f667abee455bde98a140645f8e19de7b84873 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 19:48:45 -0400 Subject: [PATCH 24/34] update --- NAMESPACE | 4 ---- 1 file changed, 4 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index a1fac86..776134d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,10 +10,6 @@ export(plot_net_coauthor) export(plot_net_country) export(references_read) importFrom(ggmap,geocode) -importFrom(ggplot2,theme) -importFrom(dplyr,arrange) -importFrom(dplyr,tally) -importFrom(magrittr, "%>%") importFrom(network,"%v%") importFrom(rworldmap,addMapLegend) importFrom(stats,na.omit) From 1c02339b5e8a5fb98d714b5f71ba416329af2715 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 20:15:17 -0400 Subject: [PATCH 25/34] adding license to DESCRIPTION --- DESCRIPTION | 2 +- R/authors_clean.R | 10 ---------- R/authors_match.R | 7 +------ R/plot_net_country.R | 3 --- 4 files changed, 2 insertions(+), 20 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b873658..c27d96b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,7 @@ Description: Tools to parse and organize reference records downloaded from the the names of authors, geocode their locations, and generate/visualize coauthorship networks. This package has been peer-reviewed by rOpenSci (v. 1.0). -License: GPL-3 +License: GNU General Public License v3.0 (GPL-3.0) URL: https://github.com/ropensci/refsplitr, https://docs.ropensci.org/refsplitr/ BugReports: https://github.com/ropensci/refsplitr/issues Depends: diff --git a/R/authors_clean.R b/R/authors_clean.R index e68b9df..655036a 100644 --- a/R/authors_clean.R +++ b/R/authors_clean.R @@ -91,16 +91,6 @@ with the contents of column CA.', sep=" ") final <- final[, c(cols, colnames(final)[!colnames(final) %in% cols])] - # sub_authors <- final %>% - # filter( - # groupID %in% final$groupID[!is.na(similarity) | flagged == 1] - # ) %>% - # select(authorID, AU, AF, groupID, match_name, matchID, - # similarity, confidence, university, department, - # postal_code, country, address, RP_address, RI, - # OI, EM, UT, author_order, refID, PT, PY, PU) %>% - # arrange(groupID, similarity, authorID) - # sub_authors <- subset(final, groupID %in% groupID[!is.na(similarity) | flagged == 1], select = c( diff --git a/R/authors_match.R b/R/authors_match.R index 988a77a..63a7731 100644 --- a/R/authors_match.R +++ b/R/authors_match.R @@ -139,12 +139,7 @@ authors_match <- function(data){ if (any(matched_df$merged[matched_df$groupID == q])) next sub <- matched_df[matched_df$groupID == q, ] - # common_df <- matched_df %>% - # dplyr::filter( - # squash %in% sub$squash & - # ( (f_c %in% 1) | (f_c > 1 & first %in% sub$first) ) & - # groupID != q - # ) + common_df <- subset(matched_df, squash %in% sub$squash & ( (f_c %in% 1) | (f_c > 1 & first %in% sub$first) ) & diff --git a/R/plot_net_country.R b/R/plot_net_country.R index d619631..1e5c2c3 100644 --- a/R/plot_net_country.R +++ b/R/plot_net_country.R @@ -96,9 +96,6 @@ plot_net_country <- function(data, .default = as.character(country) )) - # are there any without lat/lon but WITH country? - # data %>% filter(is.na(lat)==TRUE) %>% distinct(country) - ## we could use a sparse matrix representation: linkages <- Matrix::spMatrix( From e68c68004e4c24feb62b5df51efd8d7b503f4e5c Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 20:26:47 -0400 Subject: [PATCH 26/34] update vignette Rmd --- vignettes/refsplitr.Rmd | 311 ++++++++++++++++++++++++++++------- vignettes/refsplitr.Rmd.orig | 76 ++++++--- 2 files changed, 303 insertions(+), 84 deletions(-) diff --git a/vignettes/refsplitr.Rmd b/vignettes/refsplitr.Rmd index 817f818..60444f1 100644 --- a/vignettes/refsplitr.Rmd +++ b/vignettes/refsplitr.Rmd @@ -1,7 +1,7 @@ --- title: "refsplitr" author: "Auriel M. V. Fournier, Matthew E. Boone, Forrest R. Stevens, Emilio M. Bruna" -date: "2022-02-04" +date: "2025-03-24" output: rmarkdown::html_vignette: fig_width: 6 @@ -43,7 +43,7 @@ The `refsplitr` package can either import a single Web of Science search result - **dir**: when loading a single file dir=FALSE, when loading multiple files dir=TRUE. If multiple files are processed `refsplitr` will identify and remove any duplicate reference records. -- **include_all**: Setting 'include_all=TRUE' will import all fields from the WOS record (see Appendix 2). The defailt is 'include_all=FALSE'. +- **include_all**: Setting 'include_all=TRUE' will import all fields from the WOS record (see Appendix 2). The default is 'include_all=FALSE'. The output of `references_read()` is an object in the R workspace. Each line of the output is a reference; the columns are the name of the .txt file from which the data were extracted, a unique id number assigned by `refsplitr` to each article, and the data from each field of the reference record (see Appendix 2 for a list of these data fields and their [Web of Science](https://images.webofknowledge.com/images/help/WOS/hs_wos_fieldtags.html) and RIS codes). This object is used by `refsplitr` in Step 2.2; we recommend also saving it as a .csv file in the "output" folder. @@ -52,8 +52,8 @@ The output of `references_read()` is an object in the R workspace. Each line of a. To import and process a single file, set dir=FALSE and set data equal to the file path. For example, if the file "example_data.txt" were saved in the "data" folder of the RStudio project, you would import and process the data file as follows: - -```r + +``` r example_refs <- references_read(data = "./data/example_data.txt", dir=FALSE, include_all = FALSE) @@ -63,7 +63,7 @@ example_refs <- references_read(data = "./data/example_data.txt", b. To import and process multiple files, set "dir = TRUE" and use "data=" to indicate the folder containing the files. For instance, if the files were saved in a folder called "UF_data" inside the "data" folder of the RStudio project, they would be imported and processed as follows: -```r +``` r example_refs <- references_read(data = "./data/UF_data", dir=TRUE, include_all = FALSE) @@ -73,7 +73,7 @@ example_refs <- references_read(data = "./data/UF_data", c. The sample data used in the examples below can be loaded and processed as follows: -```r +``` r example_refs <- references_read(data = system.file("extdata",package = "refsplitr"), dir = TRUE, include_all = FALSE) @@ -82,11 +82,14 @@ example_refs <- references_read(data = system.file("extdata",package = "refsplit d. The processed references can then be saved as a .csv file in the "output" folder of the RStudio project: -```r +``` r write.csv(example_refs,"./output/example_refs.csv") ``` -plot of chunk unnamed-chunk-5 +
+plot of chunk unnamed-chunk-5 +

plot of chunk unnamed-chunk-5

+
**Figure 1** An image of the .csv file showing a subset of the rows and columns from the output of `references_read()`. @@ -106,19 +109,22 @@ Once disambiguation is complete, users can accept `refsplitr`'s preliminary resu a. To disambiguate the authors of the references in the dataset: -```r +``` r example_a_clean <- authors_clean(example_refs) ``` b. To save the resulting list elements -- 'prelim' and 'review' -- in the "output" folder of the RStudio project as .csv files: -```r +``` r write.csv(example_a_clean$prelim,"./output/example_a_clean_prelim.csv") write.csv(example_a_clean$review,"./output/example_a_clean_review.csv") ``` -plot of chunk unnamed-chunk-8 +
+plot of chunk unnamed-chunk-8 +

plot of chunk unnamed-chunk-8

+
**Figure 2** A subset of the .csv file showing the rows and columns in the 'prelim' output from `authors_clean()`. @@ -138,18 +144,18 @@ The output of `authors_refine()` is an object in the R workspace. We recommend s #### Example -a. To accept the results of author disambiguation _**without**_ manual review (with **_default_** values for `sim_score` and `confidence`): +a. To accept the results of author disambiguation _**without**_ manual review (with default values for `sim_score` and `confidence`): -```r +``` r example_a_refined <- authors_refine(example_a_clean$review, example_a_clean$prelim) ``` -b. To accept the results of author disambiguation _**without**_ manual review (with **_user-modified_** values for `sim_score` and `confidence`): +b. To accept the results of author disambiguation _**without**_ manual review (with user-modified values for `sim_score` and `confidence`): -```r +``` r example_a_refined_mod <- authors_refine(example_a_clean$review, example_a_clean$prelim, sim_score = 0.70, @@ -159,7 +165,7 @@ example_a_refined_mod <- authors_refine(example_a_clean$review, c. to save the final disambiguated (i.e., 'refined') dataset to the "output" folder of the RStudio Project: -```r +``` r write.csv(example_a_refined,"./output/example_a_refined.csv") ``` @@ -173,31 +179,43 @@ There are two kinds of potential disambiguation errors. First, diffent authors c These corrections are uploaded using the ```authors_refine()``` function, which then generates the 'refined' dataset used in analyses (see Section 2.2.3). -**WARNING:** Tests of our disambiguation algorithm indicate it is very accurate, but no method is perfect and errors are inevitable - especially as the number of author name variants increases. However, finding these errors becomes increasigly challegning as the number of references processed increases. This is because the number of names on the _review.csv file will increase as more authors, and hence author name variants, are identified. **We strongly recommend using [code we have written](https://github.com/embruna/refsplitr_simplify_authors_review) to streamline the process of reviewing the output of ```authors_refine()```**. This code divides the list of names to review into more manageable subgroups; any errors identified are then corrected on the "_review.csv" file. The code and instructions for using it are available at [https://github.com/embruna/refsplitr_simplify_authors_review](https://github.com/embruna/refsplitr_simplify_authors_review). +**WARNING:** Tests of our disambiguation algorithm indicate it is very accurate, but no method is perfect and errors are inevitable - especially as the number of author name variants increases. However, finding these errors becomes increasingly challenging as the number of references processed increases. This is because the number of names on the _review.csv file will increase as more authors, and hence author name variants, are identified. **We strongly recommend using [code we have written](https://github.com/embruna/refsplitr_simplify_authors_review) to streamline the process of reviewing the output of ```authors_refine()```**. This code divides the list of names to review into more manageable subgroups; any errors identified are then corrected on the "_review.csv" file. The code and instructions for using it are available at [https://github.com/embruna/refsplitr_simplify_authors_review](https://github.com/embruna/refsplitr_simplify_authors_review). #### Example -Figure 3 is an example of the the first few rows and columns of the 'review' element of `authors_clean()`. Each row is the author of a paper, with their name as it is on the author list (AF). Each author has been assignd a unique authorID number as well assigned to a groupID; the "match_name" column provides the name under which the algorithm has grouped all of an author’s putative name variants when assigning the groupID number. +Figure 3 is an example of the the first few rows and columns of the 'review' element of `authors_clean()`. Each row is the author of a paper, with their name as it is on the author list (AF). Each author has been assigned a unique authorID number as well assigned to a groupID; the "match_name" column provides the name under which the algorithm has grouped all of an author’s putative name variants when assigning the groupID number. -plot of chunk unnamed-chunk-12 +
+plot of chunk unnamed-chunk-12 +

plot of chunk unnamed-chunk-12

+
**Figure 3:** First rows and columns from the review element of `authors_clean()` A review of this output indicates that `refsplitr` assigned three authors sharing the last name "Bailey" and first initial "J" to groupID number 982: John Bailey (authorID 2240), JW Bailey (authorID 982), and J Bailey (authorID 1231; Figure 4). However, we know that J Bailey at Moon University is a distinct individual that should not be in the same group as the other two. Their incorrect groupID number (982) should be replaced with their authorID number (1231). -plot of chunk unnamed-chunk-13 +
+plot of chunk unnamed-chunk-13 +

plot of chunk unnamed-chunk-13

+
**Figure 4:** review element of `authors_clean()` highlighting three authors sharing the same last and assigned the same groupID. Further review reveals that there are two putative authors named LW Wise -- one at U of Middle Earth (groupID 89) and one at U of the Shire (groupID 90; Figure 5). However, an online search reveals that this is actually the same author, who recently moved from one university to the other. The groupID for all of these records should therefore be changed to "89". -plot of chunk unnamed-chunk-14 +
+plot of chunk unnamed-chunk-14 +

plot of chunk unnamed-chunk-14

+
**Figure 5:** review element of `authors_clean()` highlighting the same author incorrectly assigned to different groupID numbers. Once these corrections have been made (Figure 6) and saved, the changes can be incorporated using the ```authors_refine()``` function (see Section 2.2.3). -plot of chunk unnamed-chunk-15 +
+plot of chunk unnamed-chunk-15 +

plot of chunk unnamed-chunk-15

+
**Figure 6:** Corrected version of the 'review' element from `authors_clean()`. @@ -220,14 +238,14 @@ The output of `authors_refine()` is an object in the R workspace, which can be s a. To merge the changes made to the disambiguations, first load the .csv file with the corrections: -```r +``` r example_a_corrected <- read.csv("correctedfile.csv") ``` The changes are then merged into the preliminary disambiguation: -```r +``` r example_a_refined <-authors_refine(example_a_corrected, example_a_clean$prelim) ``` @@ -235,18 +253,21 @@ example_a_refined <-authors_refine(example_a_corrected, b. to save the final disambiguated (i.e., 'refined') dataset to the "output" folder of the RStudio project: -```r +``` r write.csv(example_a_refined,"./output/example_a_refined.csv") ``` -plot of chunk unnamed-chunk-19 +
+plot of chunk unnamed-chunk-19 +

plot of chunk unnamed-chunk-19

+
**Figure 7:** `authors_refine()` output. c. User-selected values for `sim_score` and `confidence` can be used to merge the changes made to the "review" file by adding the two relevant arguments to the `authors_refine()` function: -```r +``` r example_a_refined_mod <- authors_refine(example_a_corrected, example_a_clean$prelim, sim_score = 0.70, @@ -255,34 +276,30 @@ example_a_refined_mod <- authors_refine(example_a_corrected, ### 2.3. Georeferencing author institutions -Users can georeference author institutions (latitude & longitude) using the `authors_georef()` function. This function has has three arguments: +Users can georeference author's institutions (latitude & longitude) using the `authors_georef()` function. This function has has three arguments: - **data**: The output created by `authors_refine()`. Must be an object. - **address_column**: A quoted character identifying the column name in which addresses are stored. This defaults to the `address` column from the `authors_refine()` output. -- **google_api**: If `FALSE` the addresses will be referenced with the [`tidygeocoder`](https://jessecambon.github.io/tidygeocoder/) package (free). If `TRUE` addresses will be georeferenced with the [Data Science Toolkit](http://www.datasciencetoolkit.org/), which uses the Google Maps API (paid after a threshold number of free queries). This defaults to `FALSE`. See **WARNINGS** below for additional details on pricing and howe to register for a Google Maps API key. +- ***google_api***: Defaults to `FALSE`. If `TRUE` georeferencing will be conducted with [`ggmap`](https://github.com/dkahle/ggmap) and the Google Maps API (paid service). If `FALSE` geocoding will be conducted with the [`tidygeocoder`](https://jessecambon.github.io/tidygeocoder/) package, which uses the Nominatum service to access [OpenStreetMap data](https://nominatim.org/). Note that when georeferencing large data sets, OSM requests users consider [downloading and installing the complete database](https://nominatim.org/release-docs/latest/admin/Installation/) to query locally instead of using the API. -The output of `authors_georef()` is a list with three elements: (1) `addresses` contains all records, (2) `missing_addresses` contains the records that could not be georeferenced, and (3) `not_missing_addresses` contains only the records with georeferenced addresses. The time required to georeference depends on the number of addresses being processed, the speed of the internet connection, and the processing power of the computer on which analyses are being conducted. +The output of `authors_georef()` is a list with three elements: (1) `addresses` contains all records, (2) `missing_addresses` contains the records that could not be georeferenced, and (3) `not_missing_addresses` contains only the records with georeferenced addresses. #### **WARNINGS**: -(1) The `authors_georef()` function requires address be data type = character. If importing a .csv file with the results of `authors_refine()` for processing with `authors_georef()`, be sure to include "stringsAsFactors = FALSE" in the `read.csv` command. - -(2) The `authors_georef()` function parses addresses from the Web of Science reference sheet and then uses one of two services to query the latitude and longitude of each address. The default option is the free [Nominatim](https://nominatim.org/) service, which uses OpenStreetMap (OSM) data and which `refsplitr` queries via the [`tidygeocoder`]((https://jessecambon.github.io/tidygeocoder/) package. Alternatively, users may elect to geocode via the [Data Science Toolkit](http://www.datasciencetoolkit.org/), which uses the Google maps API. This service allows a limited number of free queries (currently 10,000 per month) before charging users. _Many bibliometric projects will exceed this limit_. Please be sure to check the Google Maps [pricing information](https://mapsplatform.google.com/pricing/?utm_experiment=13102311) see if your research will do so and how much you will be billed for queries over this amount. You may be able to get free additional credits if you are based at a nonprofit; Google also offers faculty free credits via their teaching program. Additional information can be found [here](https://developers.google.com/maps/billing-and-pricing/public-programs?hl=en&_gl=1*1cwzikc*_ga*MTAxMzA3OTgzMy4xNzQyNjY5MzU4*_ga_NRWSTWS78N*MTc0MjgyOTEzNS4yLjEuMTc0MjgyOTI4Mi4wLjAuMA). +(1) The `authors_georef()` function requires address be data type = character. If importing a .csv file with the results of `authors_refine()` for processing with `authors_georef()`, be sure to include "stringsAsFactors = FALSE" in the `read.csv` command. -(3) Georeferencing using OSM proceeds at a rate of 1 address/second. For large data sets, OSM requests that you consider downloading the complete database to query locally instead of using the API; in addition to reducing demand on their servers this also has the advantage of being much faster. Instructions on how to do so can be found [here](https://nominatim.org/). +(2) The `authors_georef()` function parses addresses from the Web of Science reference sheet and then attempts to calculate the latitude and longitude for them with the [Data Science Toolkit](http://www.datasciencetoolkit.org/). The time required to do so depends on the number of addresses being processed, the speed of the internet connection, and the processing power of the computer on which analyses are being conducted. -(4) To use the Google Maps option you will need a Google Maps API key. Details on how to register for one are available on the [`ggmap`](https://github.com/dkahle/ggmap?tab=readme-ov-file#google-maps-api-key) repository. - -(5) Georeferencing with the Google Maps API is slightly more powerful - while `refsplitr`'s OSM queries use city and country (also state if in the USA) to georeference, the Google Maps queries can include the name of the institution. This is useful if the city is not included in an author's address, but it can also result in the same lat-long being returned for geographically distinct facilities of the same institution (e.g. the Mississippi State University Main Campus (in Starkville) and the Mississippi State University Coastal Research and Extension (in Biloxi)). +(3) This version of `refsplitr` (v1.0) has difficulty differentiating between some geographically distinct installations of the same institution (e.g. the Mississippi State University Main Campus in Starkville vs the Mississippi State University Coastal Research and Extension located 250 miles away in Biloxi). #### Example a. to georeference author institutions: -```r +``` r example_georef <-authors_georef(data=example_a_refined, address_column = "address", @@ -292,20 +309,49 @@ example_georef <-authors_georef(data=example_a_refined, Note that while this function is being executed a message will be printed every time a location is geoprocessed. These messages can be suppressed by modifying the function call as follows: -```r +``` r + +example_georef <-suppressMessages(authors_georef( + data=example_a_refined, + address_column="address", + google_api = FALSE)) +#> Error in curl::curl_fetch_memory(url, handle = handle): Failed initialization [nominatim.openstreetmap.org] +``` + + +#### Registering with Google for an API key (NB: this is a paid service) + +1. Install and load the `ggmap` package + + +``` r + +install.packages("ggmap") +library(ggmap) -example_georef <-suppressMessages(authors_georef(data=example_a_refined, - address_column = "address")) +``` + +2. Register for a Google [Geocoding API](https://developers.google.com/maps/documentation/geocoding/overview) by following the instructions on the `READ ME` of the [`ggmap`](https://github.com/dkahle/ggmap) repository. + +3. Once you have your API key, add it to your `~/.Renviron` with the following: + +``` r +`ggmap::register_google(key = "[your key]", write = TRUE)` ``` +4. You should now be able to use `authors_georef()` as described in the vignette. **WARNING:** `refsplitr` currently has a limit of 2500 API calls per day. We are working on including the ability for users to select their own limits. + +***Remember***: Your API key is unique and for you alone. Don't share it with other users or record it in a script file that is saved in a public repository. If need be you can visit the same website where you initially registered and generate a new key. + + ### 2.4. Data Visualization: Productivity and Collaboration `refsplitr` can generate five visualizations of scientific productivity and couthorship. The functions that generate these visualization use packages `rworldmap` (No. 1), `ggplot2` (Nos. 2,4,and 5), and `igraph` (No. 3). Advanced users of these packages can customize the visualizations to suit their needs. **WARNING**: The time required to render these plots is highly dependent on the number of authors in the dataset and the processing power of the computer on which analyses are being carried out. #### 2.4.1. Visualization 1: Authors per country. -The `plot_addresses_country()` makes a map whose shading of each country indicates the number of papers in the daraset written by authors based there. Note that there is no fractional authorship, e.g., if a paper is written by a team of authors based in the USA, Brazil, and Belgium, then that publication will count as one paper towards the total for each of these countries. +The `plot_addresses_country()` makes a map whose shading of each country indicates the number of papers in the data set written by authors based there. Note that there is no fractional authorship, e.g., if a paper is written by a team of authors based in the USA, Brazil, and Belgium, then that publication will count as one paper towards the total for each of these countries. The function has two arguments: @@ -320,8 +366,9 @@ The output of `plot_addresses_country()` is plot from the `rworldmap` package. a. Author records plotted on a world map -```r +``` r plot_addresses_country <- plot_addresses_country(example_georef$addresses) +#> Error: object 'example_georef' not found ``` **Figure 8:** Plot of the countries in which the authors in the dataset are based, with shading to indicate the number of authors based in each of country. @@ -343,9 +390,48 @@ The output of `authors_georef()` is a ggplot object. a. Mapped location of author institutions (global). -```r +``` r plot_addresses_points <- plot_addresses_points(example_georef$addresses) +#> Error: object 'example_georef' not found plot_addresses_points +#> function (data, mapCountry = NULL) +#> { +#> world <- ggplot2::map_data("world") +#> world <- world[world$region != "Antarctica", ] +#> if (!is.null(mapCountry)) { +#> world <- world[which(world$region == mapCountry), ] +#> if (mapCountry %in% c("UK")) +#> mapCountry <- c("england", "wales") +#> data <- data[data$country %in% tolower(mapCountry), ] +#> } +#> points <- data.frame(lat = as.numeric(as.character(data$lat)), +#> lon = as.numeric(as.character(data$lon))) +#> points <- points[!is.na(points$lat), ] +#> points <- points[!is.na(points$lon), ] +#> latmin <- min(world$lat) - 2 +#> latmax <- max(world$lat) + 2 +#> longmin <- min(world$long) - 2 +#> longmax <- max(world$long) + 2 +#> if (!is.null(mapCountry) && mapCountry == "USA") { +#> longmax <- max(world$long[world$long < 0]) + 2 +#> } +#> if (longmin < -180) +#> longmin <- -180 +#> if (longmax > 180) +#> longmax <- 180 +#> lon <- ggplot2::quo(lon) +#> lat <- ggplot2::quo(lat) +#> region <- ggplot2::quo(region) +#> ggplot2::ggplot() + ggplot2::geom_map(data = world, map = world, +#> ggplot2::aes(map_id = region), color = "gray", fill = "#7f7f7f", +#> linewidth = 0.05, alpha = 1/4) + ggplot2::geom_point(data = points, +#> ggplot2::aes(x = !!lon, y = !!lat)) + ggplot2::coord_map(ylim = c(latmin, +#> latmax), xlim = c(longmin, longmax)) + ggplot2::ylab("latitude") + +#> ggplot2::xlab("longitude") + ggplot2::theme_bw() + ggplot2::theme(panel.grid.major = ggplot2::element_blank(), +#> panel.grid.minor = ggplot2::element_blank(), axis.ticks = ggplot2::element_blank()) +#> } +#> +#> ``` b. Mapped location of author institutions (national). @@ -355,10 +441,49 @@ b. Mapped location of author institutions (national). **Figure 9:** Figure indicating the georefeenced locations of all authors in the dataset -```r +``` r plot_addresses_points <- plot_addresses_points(example_georef$addresses, mapCountry = "Brazil") +#> Error: object 'example_georef' not found plot_addresses_points +#> function (data, mapCountry = NULL) +#> { +#> world <- ggplot2::map_data("world") +#> world <- world[world$region != "Antarctica", ] +#> if (!is.null(mapCountry)) { +#> world <- world[which(world$region == mapCountry), ] +#> if (mapCountry %in% c("UK")) +#> mapCountry <- c("england", "wales") +#> data <- data[data$country %in% tolower(mapCountry), ] +#> } +#> points <- data.frame(lat = as.numeric(as.character(data$lat)), +#> lon = as.numeric(as.character(data$lon))) +#> points <- points[!is.na(points$lat), ] +#> points <- points[!is.na(points$lon), ] +#> latmin <- min(world$lat) - 2 +#> latmax <- max(world$lat) + 2 +#> longmin <- min(world$long) - 2 +#> longmax <- max(world$long) + 2 +#> if (!is.null(mapCountry) && mapCountry == "USA") { +#> longmax <- max(world$long[world$long < 0]) + 2 +#> } +#> if (longmin < -180) +#> longmin <- -180 +#> if (longmax > 180) +#> longmax <- 180 +#> lon <- ggplot2::quo(lon) +#> lat <- ggplot2::quo(lat) +#> region <- ggplot2::quo(region) +#> ggplot2::ggplot() + ggplot2::geom_map(data = world, map = world, +#> ggplot2::aes(map_id = region), color = "gray", fill = "#7f7f7f", +#> linewidth = 0.05, alpha = 1/4) + ggplot2::geom_point(data = points, +#> ggplot2::aes(x = !!lon, y = !!lat)) + ggplot2::coord_map(ylim = c(latmin, +#> latmax), xlim = c(longmin, longmax)) + ggplot2::ylab("latitude") + +#> ggplot2::xlab("longitude") + ggplot2::theme_bw() + ggplot2::theme(panel.grid.major = ggplot2::element_blank(), +#> panel.grid.minor = ggplot2::element_blank(), axis.ticks = ggplot2::element_blank()) +#> } +#> +#> ``` **Figure 10:** Figure indicating the georefeenced locations of authors in the dataset with institutional addresses in Brazil. @@ -377,9 +502,38 @@ This function has one output, a plot, built in `igraph`. a. Coauthorship networked based on the country in which coauthors are based. -```r +``` r plot_net_coauthor <- plot_net_coauthor(example_georef$addresses) +#> Error: object 'example_georef' not found plot_net_coauthor +#> function (data) +#> { +#> data <- data[!is.na(data$country), ] +#> linkages <- as.matrix(Matrix::sparseMatrix(dims = c(length(unique(data$country)), +#> length(unique(data$UT))), i = as.numeric(factor(data$country)), +#> j = as.numeric(factor(data$UT)), x = rep(1, length(data$country)))) +#> links <- matrix(data = linkages, nrow = length(unique(data$country)), +#> ncol = length(unique(data$UT))) +#> row.names(links) <- levels(factor(data$country)) +#> colnames(links) <- levels(factor(data$UT)) +#> linkages_countries <- links %*% base::t(links) +#> linkages_countries_net <- igraph::graph_from_adjacency_matrix(linkages_countries, +#> mode = "undirected", weighted = TRUE) +#> igraph::V(linkages_countries_net)$label <- igraph::V(linkages_countries_net)$name +#> igraph::V(linkages_countries_net)$label.color <- grDevices::rgb(0, +#> 0, 0.2, 0.5) +#> igraph::V(linkages_countries_net)$label.cex <- 0.5 +#> igraph::V(linkages_countries_net)$size <- 12 +#> igraph::V(linkages_countries_net)$frame.color <- NA +#> igraph::V(linkages_countries_net)$color <- grDevices::rgb(0, +#> 0.6, 0, 0.7) +#> linkages_countries_net <- igraph::simplify(linkages_countries_net) +#> co <- igraph::layout_with_fr(linkages_countries_net) +#> graphics::plot(linkages_countries_net, layout = co) +#> return(linkages_countries_net) +#> } +#> +#> ``` **Figure 11:** Plot of the coauthorship network for authors of articles in the dataset. @@ -410,9 +564,11 @@ The output of `plot_net_country()` is a list in the R workspace. The `$plot` ele a. Mapped coauthorship network based on the countries in which authors are based. -```r +``` r plot_net_country <- plot_net_country(example_georef$addresses) +#> Error: object 'example_georef' not found plot_net_country$plot +#> Error in plot_net_country$plot: object of type 'closure' is not subsettable ``` **Figure 12:** Map showing the coauthorship connections between countries. @@ -441,9 +597,11 @@ The output of `plot_net_address()` is a list in the R workspace. The `$plot` ele a. Coauthorship network based on the geographic locations of coauthor institutions. -```r +``` r plot_net_address <- plot_net_address(example_georef$addresses) +#> Error: object 'example_georef' not found plot_net_address$plot +#> Error in plot_net_address$plot: object of type 'closure' is not subsettable ``` **Figure 13:** Plot showing the network between individual author locations. @@ -475,13 +633,20 @@ Westgate, M. J. (2018). revtools: bibliographic data visualization for evidence ## **Appendix 1:** Guide to downloading reference records from the Web of Science. -plot of chunk unnamed-chunk-30plot of chunk unnamed-chunk-30 +
+plot of chunk unnamed-chunk-30 +

plot of chunk unnamed-chunk-30

+
+plot of chunk unnamed-chunk-30 +

plot of chunk unnamed-chunk-30

+
**Figure 13:** Web of Science Download Instructions ## **Appendix 2:** Web of Science Data Field Definitions + Table: Definition of column headings in the output of read_references()^1^. Most are [Web of Science Core Collection Field Tags](https://images.webofknowledge.com/images/help/WOS/hs_advanced_fieldtags.html) associated with different data types. |Column Heading |Definition | @@ -521,7 +686,9 @@ Table: Definition of column headings in the output of read_references()^1^. Most |refID |a unique identifier for each article in the dataset assigned by refnet | -^1^the following Web of Science data fields are only included if users select the `include_all=TRUE` option in `references_read()`: CC, CH, CL, CT, CY, DT, FX, GA, GE, J9, LA, PA, PI, PN, PS, RID, SU, VR, OA. + + +^1^the following Web of Science data fields are only included if users select the `include_all=TRUE` option in `references_read()`: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI, LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR. ^2^Includes citations in the Web of Science Core Collection, BIOSIS Citation Index, Chinese Science Citation Database, Data Citation Index, Russian Science Citation Index, and SciELO Citation Index. @@ -560,11 +727,13 @@ Table: Information provided by the `authors_clean()` function to help users asse |PU |Publisher | + + ## **Appendix 4:** Overview of the `refsplitr` author name disambiguation algorithm. Name disambiguation is a complex process that is the subject of active research. There are a variety of approaches to disambiguation in large datasets; here we describe the algorithm for parsing author addresses and disambiguating author names with the `authors_clean()` function. -There are three primary difficulties in assigning authors to their products in bibliometric databases like the Web of Science. First, not all authors have a unique identifier such as an [ORCID iD (ORCID)](https://orcid.org/) or [ResearcherID (RID)](http://www.researcherid.com/Home.action?returnCode=ROUTER.Success&Init=Yes&SrcApp=CR&SID=6CmSYQQpa1DmjP5Mo3H). Second, an author's name can vary over time. It can also be reported inconsistently accross journals. For instance, while author last names are always reported, the first names might only be represented by initials and middle names (if authors have one) might not be reported or might only be stored as a middle initial. Finally, information in the "address" field can have inconsistent structures. In addition, [only after 2008 did Web of Sceince records directly link each author with their institutional address](https://images.webofknowledge.com/images/help/WOS/hp_full_record.html#dsy1030-TRS_addresses). As a result, for pre-2008 Web of Science records it can be difficult to relate each author with their institutional address (the same is true for emaail addresses). In these cases, we have no way to reliably match addresses to authors using the information in the reference record and therefore insert 'Could not be extracted' in the address field. This does not mean an address was not listed or cannot be assigned after manual inspection - only that there was no way to discern to which author the address belongs. Coupled with changes in author addresses as they change institutions, this the inconsistent or incomplete information associated with author addresses makes disambiguating names difficult. +There are three primary difficulties in assigning authors to their products in bibliometric databases like the Web of Science. First, not all authors have a unique identifier such as an [ORCID iD (ORCID)](https://orcid.org/) or [ResearcherID (RID)](http://www.researcherid.com/Home.action?returnCode=ROUTER.Success&Init=Yes&SrcApp=CR&SID=6CmSYQQpa1DmjP5Mo3H). Second, an author's name can vary over time. It can also be reported inconsistently across journals. For instance, while author last names are always reported, the first names might only be represented by initials and middle names (if authors have one) might not be reported or might only be stored as a middle initial. Finally, information in the "address" field can have inconsistent structures. In addition, [only after 2008 did Web of Sceince records directly link each author with their institutional address](https://images.webofknowledge.com/images/help/WOS/hp_full_record.html#dsy1030-TRS_addresses). As a result, for pre-2008 Web of Science records it can be difficult to relate each author with their institutional address (the same is true for email addresses). In these cases, we have no way to reliably match addresses to authors using the information in the reference record and therefore insert 'Could not be extracted' in the address field. This does not mean an address was not listed or cannot be assigned after manual inspection - only that there was no way to discern to which author the address belongs. Coupled with changes in author addresses as they change institutions, this the inconsistent or incomplete information associated with author addresses makes disambiguating names difficult. To address this we've created a process to identify clusters or common authors by iteratively building webs of authors using any available information to link groups together. In this way we do not require an entry to contain all relevant fields, but can nevertheless create likely groupings and then refine them by throwing out obvious spurious connections. In the case of authors with ORCID and RID numbers, identifying commonalities is quite easy, and we hope that authors will continue to sign up for these identifiers to facilitate disambiguation. @@ -575,7 +744,10 @@ The first step in our disambiguation process is matching all groups together wit Below is an example of how the algorithm processes a sample data set. -plot of chunk unnamed-chunk-33 +
+plot of chunk unnamed-chunk-33 +

plot of chunk unnamed-chunk-33

+
**Figure 14:** Example dataset. @@ -591,7 +763,10 @@ To lower the number of Type II errors we build a dataset of possible matches for **Entry 1.** In our test data we will start trying to match the first entry "Smith, J" in row 1. By subsetting with the above rules, we'd be matching the first row against rows 2, 3, 4, 5, 6, 7, 9: -plot of chunk unnamed-chunk-34 +
+plot of chunk unnamed-chunk-34 +

plot of chunk unnamed-chunk-34

+
**Figure 15:** Figure for entry 1 @@ -603,7 +778,10 @@ In our test data, there is only one piece of information we can match against - **Entry 3.** Row 3 has 2 unique identifying pieces of information: A middle initial and an email. This subset is smaller because we have a middle initial to filter out the Smith, J.L entries: -plot of chunk unnamed-chunk-35 +
+plot of chunk unnamed-chunk-35 +

plot of chunk unnamed-chunk-35

+
**Figure 16:** Figure for entry 3 @@ -611,7 +789,10 @@ Matching this information against our subset, the two possible matches are Row 2 **Entry 4** - This entry gets assigned groupID = 2 as well because it has a matching middle initial with Row 2 and Row 3: -plot of chunk unnamed-chunk-36 +
+plot of chunk unnamed-chunk-36 +

plot of chunk unnamed-chunk-36

+
**Figure 16:** Figure for entry 4 @@ -621,7 +802,10 @@ Matching this information against our subset, the two possible matches are Row 2 **Entry 7** - Entry 7 has one unique identifier: an email address. It gets matched to the entry in Row 3 and therefore is assigned groupID = 2. -plot of chunk unnamed-chunk-37 +
+plot of chunk unnamed-chunk-37 +

plot of chunk unnamed-chunk-37

+
**Figure 17:** Figure for entry 7 @@ -629,7 +813,10 @@ After these first 7 entries, we've correctly matched all likely 'Smith, Jon Karl **Entry 8** - This novel entry has two unique pieces of information: a middle initial and an ORCID. We know the ORCID did not match any previous entries, and the middle initial does not match up with any of the 'Smith' names in our record. -plot of chunk unnamed-chunk-38 +
+plot of chunk unnamed-chunk-38 +

plot of chunk unnamed-chunk-38

+
**Figure 18:** Figure for entry 8 @@ -639,13 +826,19 @@ Because there are no suitable matches using initial criteria, we instead match t **Entry 10** - This entry has no matching names and results in no change to the groupID number. -plot of chunk unnamed-chunk-39 +
+plot of chunk unnamed-chunk-39 +

plot of chunk unnamed-chunk-39

+
**Figure 19:** Figure for Entry 10 Thus our final results are: -plot of chunk unnamed-chunk-40 +
+plot of chunk unnamed-chunk-40 +

plot of chunk unnamed-chunk-40

+
**Figure 20:** Final Results. diff --git a/vignettes/refsplitr.Rmd.orig b/vignettes/refsplitr.Rmd.orig index ece6fc8..bd6cc11 100644 --- a/vignettes/refsplitr.Rmd.orig +++ b/vignettes/refsplitr.Rmd.orig @@ -51,7 +51,7 @@ The `refsplitr` package can either import a single Web of Science search result - **dir**: when loading a single file dir=FALSE, when loading multiple files dir=TRUE. If multiple files are processed `refsplitr` will identify and remove any duplicate reference records. -- **include_all**: Setting 'include_all=TRUE' will import all fields from the WOS record (see Appendix 2). The defailt is 'include_all=FALSE'. +- **include_all**: Setting 'include_all=TRUE' will import all fields from the WOS record (see Appendix 2). The default is 'include_all=FALSE'. The output of `references_read()` is an object in the R workspace. Each line of the output is a reference; the columns are the name of the .txt file from which the data were extracted, a unique id number assigned by `refsplitr` to each article, and the data from each field of the reference record (see Appendix 2 for a list of these data fields and their [Web of Science](https://images.webofknowledge.com/images/help/WOS/hs_wos_fieldtags.html) and RIS codes). This object is used by `refsplitr` in Step 2.2; we recommend also saving it as a .csv file in the "output" folder. @@ -144,14 +144,14 @@ The output of `authors_refine()` is an object in the R workspace. We recommend s #### Example -a. To accept the results of author disambiguation _**without**_ manual review (with **_default_** values for `sim_score` and `confidence`): +a. To accept the results of author disambiguation _**without**_ manual review (with default values for `sim_score` and `confidence`): ```{r, results="hide", message=FALSE} example_a_refined <- authors_refine(example_a_clean$review, example_a_clean$prelim) ``` -b. To accept the results of author disambiguation _**without**_ manual review (with **_user-modified_** values for `sim_score` and `confidence`): +b. To accept the results of author disambiguation _**without**_ manual review (with user-modified values for `sim_score` and `confidence`): ```{r, results="hide", message=FALSE} example_a_refined_mod <- authors_refine(example_a_clean$review, @@ -176,11 +176,11 @@ There are two kinds of potential disambiguation errors. First, diffent authors c These corrections are uploaded using the ```authors_refine()``` function, which then generates the 'refined' dataset used in analyses (see Section 2.2.3). -**WARNING:** Tests of our disambiguation algorithm indicate it is very accurate, but no method is perfect and errors are inevitable - especially as the number of author name variants increases. However, finding these errors becomes increasigly challegning as the number of references processed increases. This is because the number of names on the _review.csv file will increase as more authors, and hence author name variants, are identified. **We strongly recommend using [code we have written](https://github.com/embruna/refsplitr_simplify_authors_review) to streamline the process of reviewing the output of ```authors_refine()```**. This code divides the list of names to review into more manageable subgroups; any errors identified are then corrected on the "_review.csv" file. The code and instructions for using it are available at [https://github.com/embruna/refsplitr_simplify_authors_review](https://github.com/embruna/refsplitr_simplify_authors_review). +**WARNING:** Tests of our disambiguation algorithm indicate it is very accurate, but no method is perfect and errors are inevitable - especially as the number of author name variants increases. However, finding these errors becomes increasingly challenging as the number of references processed increases. This is because the number of names on the _review.csv file will increase as more authors, and hence author name variants, are identified. **We strongly recommend using [code we have written](https://github.com/embruna/refsplitr_simplify_authors_review) to streamline the process of reviewing the output of ```authors_refine()```**. This code divides the list of names to review into more manageable subgroups; any errors identified are then corrected on the "_review.csv" file. The code and instructions for using it are available at [https://github.com/embruna/refsplitr_simplify_authors_review](https://github.com/embruna/refsplitr_simplify_authors_review). #### Example -Figure 3 is an example of the the first few rows and columns of the 'review' element of `authors_clean()`. Each row is the author of a paper, with their name as it is on the author list (AF). Each author has been assignd a unique authorID number as well assigned to a groupID; the "match_name" column provides the name under which the algorithm has grouped all of an author’s putative name variants when assigning the groupID number. +Figure 3 is an example of the the first few rows and columns of the 'review' element of `authors_clean()`. Each row is the author of a paper, with their name as it is on the author list (AF). Each author has been assigned a unique authorID number as well assigned to a groupID; the "match_name" column provides the name under which the algorithm has grouped all of an author’s putative name variants when assigning the groupID number. ```{r, echo=FALSE, fig.pos="H", out.width="500px"} knitr::include_graphics("images/review_file.png", error = F) @@ -264,12 +264,14 @@ example_a_refined_mod <- authors_refine(example_a_corrected, ### 2.3. Georeferencing author institutions -Users can georeference author's institutions (latitude & longitude) using the `authors_georef()` function. This function has has two arguments: +Users can georeference author's institutions (latitude & longitude) using the `authors_georef()` function. This function has has three arguments: - **data**: The output created by `authors_refine()`. Must be an object. - **address_column**: A quoted character identifying the column name in which addresses are stored. This defaults to the `address` column from the `authors_refine()` output. +- ***google_api***: Defaults to `FALSE`. If `TRUE` georeferencing will be conducted with [`ggmap`](https://github.com/dkahle/ggmap) and the Google Maps API (paid service). If `FALSE` geocoding will be conducted with the [`tidygeocoder`](https://jessecambon.github.io/tidygeocoder/) package, which uses the Nominatum service to access [OpenStreetMap data](https://nominatim.org/). Note that when georeferencing large data sets, OSM requests users consider [downloading and installing the complete database](https://nominatim.org/release-docs/latest/admin/Installation/) to query locally instead of using the API. + The output of `authors_georef()` is a list with three elements: (1) `addresses` contains all records, (2) `missing_addresses` contains the records that could not be georeferenced, and (3) `not_missing_addresses` contains only the records with georeferenced addresses. #### **WARNINGS**: @@ -287,25 +289,53 @@ a. to georeference author institutions: ```{r,eval=FALSE} example_georef <-authors_georef(data=example_a_refined, - address_column = "address") + address_column = "address", + google_api = FALSE) ``` Note that while this function is being executed a message will be printed every time a location is geoprocessed. These messages can be suppressed by modifying the function call as follows: -```{r, eval=FALSE, results="hide", message=FALSE} +```{r, results="hide", message=FALSE} + +example_georef <-suppressMessages(authors_georef( + data=example_a_refined, + address_column="address", + google_api = FALSE)) + +``` + + +#### Registering with Google for an API key (NB: this is a paid service) + +1. Install and load the `ggmap` package + +```{r example2, eval=FALSE} + +install.packages("ggmap") +library(ggmap) -example_georef <-suppressMessages(authors_georef(data=example_a_refined, - address_column = "address")) +``` + +2. Register for a Google [Geocoding API](https://developers.google.com/maps/documentation/geocoding/overview) by following the instructions on the `READ ME` of the [`ggmap`](https://github.com/dkahle/ggmap) repository. + +3. Once you have your API key, add it to your `~/.Renviron` with the following: +```{r example3, eval=FALSE} +`ggmap::register_google(key = "[your key]", write = TRUE)` ``` +4. You should now be able to use `authors_georef()` as described in the vignette. **WARNING:** `refsplitr` currently has a limit of 2500 API calls per day. We are working on including the ability for users to select their own limits. + +***Remember***: Your API key is unique and for you alone. Don't share it with other users or record it in a script file that is saved in a public repository. If need be you can visit the same website where you initially registered and generate a new key. + + ### 2.4. Data Visualization: Productivity and Collaboration `refsplitr` can generate five visualizations of scientific productivity and couthorship. The functions that generate these visualization use packages `rworldmap` (No. 1), `ggplot2` (Nos. 2,4,and 5), and `igraph` (No. 3). Advanced users of these packages can customize the visualizations to suit their needs. **WARNING**: The time required to render these plots is highly dependent on the number of authors in the dataset and the processing power of the computer on which analyses are being carried out. #### 2.4.1. Visualization 1: Authors per country. -The `plot_addresses_country()` makes a map whose shading of each country indicates the number of papers in the daraset written by authors based there. Note that there is no fractional authorship, e.g., if a paper is written by a team of authors based in the USA, Brazil, and Belgium, then that publication will count as one paper towards the total for each of these countries. +The `plot_addresses_country()` makes a map whose shading of each country indicates the number of papers in the data set written by authors based there. Note that there is no fractional authorship, e.g., if a paper is written by a team of authors based in the USA, Brazil, and Belgium, then that publication will count as one paper towards the total for each of these countries. The function has two arguments: @@ -319,7 +349,7 @@ The output of `plot_addresses_country()` is plot from the `rworldmap` package. a. Author records plotted on a world map -```{r,eval=FALSE, fig.pos="H", out.width="500px",message=FALSE,results="hide"} +```{r, fig.pos="H", out.width="500px",message=FALSE,results="hide"} plot_addresses_country <- plot_addresses_country(example_georef$addresses) ``` @@ -341,7 +371,7 @@ The output of `authors_georef()` is a ggplot object. a. Mapped location of author institutions (global). -```{r, eval=FALSE, fig.pos="H", out.width="500px"} +```{r, fig.pos="H", out.width="500px"} plot_addresses_points <- plot_addresses_points(example_georef$addresses) plot_addresses_points ``` @@ -354,7 +384,7 @@ b. Mapped location of author institutions (national). **Figure 9:** Figure indicating the georefeenced locations of all authors in the dataset -```{r, eval=FALSE, fig.pos="H", out.width="500px"} +```{r, fig.pos="H", out.width="500px"} plot_addresses_points <- plot_addresses_points(example_georef$addresses, mapCountry = "Brazil") plot_addresses_points @@ -375,7 +405,7 @@ This function has one output, a plot, built in `igraph`. a. Coauthorship networked based on the country in which coauthors are based. -```{r, eval=FALSE, fig.pos="H", out.width="500px"} +```{r, fig.pos="H", out.width="500px"} plot_net_coauthor <- plot_net_coauthor(example_georef$addresses) plot_net_coauthor ``` @@ -407,7 +437,7 @@ The output of `plot_net_country()` is a list in the R workspace. The `$plot` ele a. Mapped coauthorship network based on the countries in which authors are based. -```{r fig.pos="H", eval=FALSE, out.width="500px"} +```{r fig.pos="H", out.width="500px"} plot_net_country <- plot_net_country(example_georef$addresses) plot_net_country$plot ``` @@ -437,7 +467,7 @@ The output of `plot_net_address()` is a list in the R workspace. The `$plot` ele a. Coauthorship network based on the geographic locations of coauthor institutions. -```{r, eval=FALSE, fig.pos="H", out.width="500px"} +```{r, fig.pos="H", out.width="500px"} plot_net_address <- plot_net_address(example_georef$addresses) plot_net_address$plot ``` @@ -483,10 +513,7 @@ knitr::include_graphics("images/app1-2.png", error = F) ## **Appendix 2:** Web of Science Data Field Definitions ```{r echo = FALSE, results = 'asis'} - -file1 = system.file("extdata", "App2_Table1.csv", package = "refsplitr") -App2Table1<-read.csv(file1,dec=".", header = TRUE, sep = ",", check.names=FALSE) - +App2Table1<-read.csv("App2_Table1.csv",dec=".", header = TRUE, sep = ",", check.names=FALSE) knitr::kable(App2Table1, caption = "Definition of column headings in the output of read_references()^1^. Most are [Web of Science Core Collection Field Tags](https://images.webofknowledge.com/images/help/WOS/hs_advanced_fieldtags.html) associated with different data types.") ``` @@ -501,10 +528,9 @@ knitr::kable(App2Table1, caption = "Definition of column headings in the output The information in the Table below is provided by the `authors_clean()` function to help users assess the validity of groupID assignments made by `refsplitr`'s disambiguation algorithm. However, finding any errors in disambiguation becomes increasigly challegning as the number of references processed increases. This is because the number of names on the _review.csv file will increase as more authors, and hence author name variants, are identified. We **strongly** recommend using [code we have written](https://github.com/embruna/refsplitr_simplify_authors_review) to streamline the process of reviewing the output of ```authors_refine()```. This code divides the list of names to review into more manageable subgroups; any errors identified are then corrected on the "_review.csv" file. The code and instructions for using it are available at [https://github.com/embruna/refsplitr_simplify_authors_review](https://github.com/embruna/refsplitr_simplify_authors_review). - ```{r echo = FALSE, results = 'asis'} -file2 = system.file("extdata", "App3_Table1.csv", package = "refsplitr") -App3Table1<-read.csv(file2,dec=".", header = TRUE, sep = ",", check.names=FALSE) +App3Table1<-read.csv("App3_Table1.csv",dec=".", header = TRUE, sep = ",", check.names=FALSE) + knitr::kable(App3Table1, caption = "Information provided by the `authors_clean()` function to help users assess the validity of groupID assignments made by `refsplitr`'s disambiguation algorithm.") ``` @@ -514,7 +540,7 @@ knitr::kable(App3Table1, caption = "Information provided by the `authors_clean() Name disambiguation is a complex process that is the subject of active research. There are a variety of approaches to disambiguation in large datasets; here we describe the algorithm for parsing author addresses and disambiguating author names with the `authors_clean()` function. -There are three primary difficulties in assigning authors to their products in bibliometric databases like the Web of Science. First, not all authors have a unique identifier such as an [ORCID iD (ORCID)](https://orcid.org/) or [ResearcherID (RID)](http://www.researcherid.com/Home.action?returnCode=ROUTER.Success&Init=Yes&SrcApp=CR&SID=6CmSYQQpa1DmjP5Mo3H). Second, an author's name can vary over time. It can also be reported inconsistently accross journals. For instance, while author last names are always reported, the first names might only be represented by initials and middle names (if authors have one) might not be reported or might only be stored as a middle initial. Finally, information in the "address" field can have inconsistent structures. In addition, [only after 2008 did Web of Sceince records directly link each author with their institutional address](https://images.webofknowledge.com/images/help/WOS/hp_full_record.html#dsy1030-TRS_addresses). As a result, for pre-2008 Web of Science records it can be difficult to relate each author with their institutional address (the same is true for emaail addresses). In these cases, we have no way to reliably match addresses to authors using the information in the reference record and therefore insert 'Could not be extracted' in the address field. This does not mean an address was not listed or cannot be assigned after manual inspection - only that there was no way to discern to which author the address belongs. Coupled with changes in author addresses as they change institutions, this the inconsistent or incomplete information associated with author addresses makes disambiguating names difficult. +There are three primary difficulties in assigning authors to their products in bibliometric databases like the Web of Science. First, not all authors have a unique identifier such as an [ORCID iD (ORCID)](https://orcid.org/) or [ResearcherID (RID)](http://www.researcherid.com/Home.action?returnCode=ROUTER.Success&Init=Yes&SrcApp=CR&SID=6CmSYQQpa1DmjP5Mo3H). Second, an author's name can vary over time. It can also be reported inconsistently across journals. For instance, while author last names are always reported, the first names might only be represented by initials and middle names (if authors have one) might not be reported or might only be stored as a middle initial. Finally, information in the "address" field can have inconsistent structures. In addition, [only after 2008 did Web of Sceince records directly link each author with their institutional address](https://images.webofknowledge.com/images/help/WOS/hp_full_record.html#dsy1030-TRS_addresses). As a result, for pre-2008 Web of Science records it can be difficult to relate each author with their institutional address (the same is true for email addresses). In these cases, we have no way to reliably match addresses to authors using the information in the reference record and therefore insert 'Could not be extracted' in the address field. This does not mean an address was not listed or cannot be assigned after manual inspection - only that there was no way to discern to which author the address belongs. Coupled with changes in author addresses as they change institutions, this the inconsistent or incomplete information associated with author addresses makes disambiguating names difficult. To address this we've created a process to identify clusters or common authors by iteratively building webs of authors using any available information to link groups together. In this way we do not require an entry to contain all relevant fields, but can nevertheless create likely groupings and then refine them by throwing out obvious spurious connections. In the case of authors with ORCID and RID numbers, identifying commonalities is quite easy, and we hope that authors will continue to sign up for these identifiers to facilitate disambiguation. From f8b05cb1fe5489d591d7c1b0eaf59b6f1b119987 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 21:18:28 -0400 Subject: [PATCH 27/34] removing case_when --- DESCRIPTION | 2 +- R/plot_net_country.R | 126 ++++++++++++++++++++++++------------------- 2 files changed, 73 insertions(+), 55 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c27d96b..71d1688 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,7 @@ Description: Tools to parse and organize reference records downloaded from the the names of authors, geocode their locations, and generate/visualize coauthorship networks. This package has been peer-reviewed by rOpenSci (v. 1.0). -License: GNU General Public License v3.0 (GPL-3.0) +License: GPL-3.0 URL: https://github.com/ropensci/refsplitr, https://docs.ropensci.org/refsplitr/ BugReports: https://github.com/ropensci/refsplitr/issues Depends: diff --git a/R/plot_net_country.R b/R/plot_net_country.R index 1e5c2c3..d99b2fa 100644 --- a/R/plot_net_country.R +++ b/R/plot_net_country.R @@ -45,8 +45,7 @@ plot_net_country <- function(data, mapRegion = "world", lineAlpha = 0.5) { - requireNamespace(package = "dplyr", quietly = TRUE) - requireNamespace(package = "magrittr", quietly = TRUE) + fixable_countries<-data %>% dplyr::filter(is.na(country)==FALSE & is.na(lat)==TRUE) %>% @@ -58,44 +57,40 @@ plot_net_country <- function(data, data <- data[!is.na(data$country), ] - # names in WOS often don't match those in rworldmap' - data<-data %>% - dplyr::mutate(country=dplyr::case_when( - country == "usa" ~ "united states of america", - country == "united states" ~ "united states of america", - country == "serbia" ~ "republic of serbia", - country == "peoples r china" ~ "china", - country == "uk" ~ "united kingdom", - country == "england" ~ "united kingdom", - country == "scotland" ~ "united kingdom", - country == "wales" ~ "united kingdom", - country == "north ireland" ~ "united kingdom", - country == "cent afr republ" ~ "central african republic", - country == "cote ivoire" ~ "ivory coast", - country == "papua n guinea" ~ "papua new guinea", - country == "sao tome & prin" ~ "sao tome and principe", - country == "tanzania" ~ "united republic of tanzania", - country == "rep congo" ~ "republic of the congo", - country == "bahamas" ~ "the bahamas", - country == "dem rep congo" ~ "republic of the congo", - country == "rep congo" ~ "democratic republic of the congo", - country == "democratic republic of congo" ~ "democratic republic of the congo", - country == "fr polynesia" ~ "french polynesia", - country == "surinam" ~ "suriname", - country == "turks & caicos" ~ "turks and caicos islands", - country == "u arab emirates" ~ "united arab emirates", - # country == "curaçao" ~ "curacao", - country == "cura\u00e7ao" ~ "curacao", # to avoid fail for non-ascii characters - country == "libyan arab jamahiriya" ~ "libya", - country == "rhodesia" ~ "zimbabwe", - country == "russian federation" ~ "russia", - country == "hong kong" ~ "hong kong sar", - country == "hong kong s.a.r." ~ "hong kong sar", - country == "brunei darussalam" ~ "brunei", - country == "trinidade and tobago" ~ "trinidad and tobago", - .default = as.character(country) - )) + data$country[data$country=="usa"] <- "united states of america" + data$country[data$country=="united states"] <- "united states of america" + data$country[data$country=="serbia"] <- "republic of serbia" + data$country[data$country=="peoples r china"] <- "china" + data$country[data$country=="uk"] <- "united kingdom" + data$country[data$country=="england"] <- "united kingdom" + data$country[data$country=="scotland"] <- "united kingdom" + data$country[data$country=="wales"] <- "united kingdom" + data$country[data$country=="north ireland"] <- "united kingdom" + data$country[data$country=="cent afr republ"] <- "central african republic" + data$country[data$country=="cote ivoire"] <- "ivory coast" + data$country[data$country=="papua n guinea"] <- "papua new guinea" + data$country[data$country=="sao tome & prin"] <- "sao tome and principe" + data$country[data$country=="tanzania"] <- "united republic of tanzania" + data$country[data$country=="rep congo"] <- "republic of the congo" + data$country[data$country=="bahamas"] <- "the bahamas" + data$country[data$country=="dem rep congo"] <- "republic of the congo" + data$country[data$country=="rep congo"] <- "democratic republic of the congo" + data$country[data$country=="democratic republic of congo"] <- "democratic republic of the congo" + data$country[data$country=="fr polynesia"] <- "french polynesia" + data$country[data$country=="surinam"] <- "suriname" + data$country[data$country=="turks & caicos"] <- "turks and caicos islands" + data$country[data$country=="u arab emirates"] <- "united arab emirates" + data$country[data$country=="curaçao"] <- "curacao" + # to avoid fail for non-ascii characters + data$country[data$country=="cura\u00e7ao"] <- "curacao" + data$country[data$country=="libyan arab jamahiriya"] <- "libya" + data$country[data$country=="rhodesia"] <- "zimbabwe" + data$country[data$country=="russian federation"] <- "russia" + data$country[data$country=="hong kong"] <- "hong kong sar" + data$country[data$country=="hong kong s.a.r."] <- "hong kong sar" + data$country[data$country=="brunei darussalam"] <- "brunei" + data$country[data$country=="trinidade and tobago"] <- "trinidad and tobago" ## we could use a sparse matrix representation: linkages <- Matrix::spMatrix( @@ -174,21 +169,44 @@ plot_net_country <- function(data, # coords_df %>% filter(is.na(LAT)==TRUE) %>% distinct(ISO_A2) # need to add them manually - coords_df<- coords_df %>% - dplyr::mutate(LAT=dplyr::case_when( - ISO_A2 == "french guiana" ~ 3.9339, - ISO_A2 == "bonaire" ~ 12.2019, - ISO_A2 == "reunion" ~ -68.2624, - ISO_A2 == "palestine" ~ 31.9522, - .default = as.numeric(LAT) - )) %>% - dplyr::mutate(LON=dplyr::case_when( - ISO_A2 == "french guiana" ~ -53.1258, - ISO_A2 == "bonaire" ~ -68.2624, - ISO_A2 == "reunion" ~ 55.5364, - ISO_A2 == "palestine" ~ 35.2332, - .default = as.numeric(LON) - )) + # LAT + + + coords_df$LAT <- ifelse(ISO_A2 == "french guiana", + 3.9339, + coords_df$LAT) + + coords_df$LAT <- ifelse(ISO_A2 == "bonaire", + 12.2019, + coords_df$LAT) + + coords_df$LAT <- ifelse(ISO_A2 == "reunion", + -68.2624, + coords_df$LAT) + + coords_df$LAT <- ifelse(ISO_A2 == "palestine", + 31.9522, + coords_df$LAT) + + + # LON + + coords_df$LON <- ifelse(ISO_A2 == "french guiana", + -53.1258, + coords_df$LON) + + coords_df$LON <- ifelse(ISO_A2 == "bonaire", + -68.2624, + coords_df$LON) + + coords_df$LON <- ifelse(ISO_A2 == "reunion", + 55.5364, + coords_df$LON) + + coords_df$LON <- ifelse(ISO_A2 == "palestine", + 35.2332, + coords_df$LON) + ## One could also use ggplot to plot out the network geographically: From 76c94daadfa916eb4ab0c36760ad99d60eb4557a Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 21:31:51 -0400 Subject: [PATCH 28/34] correcting replacement --- R/plot_net_country.R | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/R/plot_net_country.R b/R/plot_net_country.R index d99b2fa..b54c408 100644 --- a/R/plot_net_country.R +++ b/R/plot_net_country.R @@ -47,11 +47,11 @@ plot_net_country <- function(data, - fixable_countries<-data %>% - dplyr::filter(is.na(country)==FALSE & is.na(lat)==TRUE) %>% - dplyr::select(refID,country) %>% - dplyr::group_by(refID,country) %>% - dplyr::tally() %>% + fixable_countries<-data |> + dplyr::filter(is.na(country)==FALSE & is.na(lat)==TRUE) |> + dplyr::select(refID,country) |> + dplyr::group_by(refID,country) |> + dplyr::tally() |> dplyr::arrange(n) @@ -172,38 +172,38 @@ plot_net_country <- function(data, # LAT - coords_df$LAT <- ifelse(ISO_A2 == "french guiana", + coords_df$LAT <- ifelse(coords_df$ISO_A2 == "french guiana", 3.9339, coords_df$LAT) - coords_df$LAT <- ifelse(ISO_A2 == "bonaire", + coords_df$LAT <- ifelse(coords_df$ISO_A2 == "bonaire", 12.2019, coords_df$LAT) - coords_df$LAT <- ifelse(ISO_A2 == "reunion", + coords_df$LAT <- ifelse(coords_df$ISO_A2 == "reunion", -68.2624, coords_df$LAT) - coords_df$LAT <- ifelse(ISO_A2 == "palestine", + coords_df$LAT <- ifelse(coords_df$ISO_A2 == "palestine", 31.9522, coords_df$LAT) # LON - coords_df$LON <- ifelse(ISO_A2 == "french guiana", + coords_df$LON <- ifelse(coords_df$ISO_A2 == "french guiana", -53.1258, coords_df$LON) - coords_df$LON <- ifelse(ISO_A2 == "bonaire", + coords_df$LON <- ifelse(coords_df$ISO_A2 == "bonaire", -68.2624, coords_df$LON) - coords_df$LON <- ifelse(ISO_A2 == "reunion", + coords_df$LON <- ifelse(coords_df$ISO_A2 == "reunion", 55.5364, coords_df$LON) - coords_df$LON <- ifelse(ISO_A2 == "palestine", + coords_df$LON <- ifelse(coords_df$ISO_A2 == "palestine", 35.2332, coords_df$LON) From 61b5349daaddf14708cc07c5f34f20f14aaf7da0 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 21:41:29 -0400 Subject: [PATCH 29/34] base-r pipe --- R/plot_net_country.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/plot_net_country.R b/R/plot_net_country.R index b54c408..4413d19 100644 --- a/R/plot_net_country.R +++ b/R/plot_net_country.R @@ -224,7 +224,7 @@ plot_net_country <- function(data, layoutCoordinates <- stats::na.omit(layoutCoordinates) - adjacencyList<- adjacencyList %>% + adjacencyList<- adjacencyList |> dplyr::mutate(country=dplyr::case_when( country == "V1" ~ NA, .default = as.character(country) @@ -232,7 +232,7 @@ plot_net_country <- function(data, - adjacencyList<- adjacencyList %>% + adjacencyList<- adjacencyList |> dplyr::mutate(countryA=dplyr::case_when( countryA == "V1" ~ NA, .default = as.character(countryA) From e945ee631949e1faa4331e2cc56b07c09d2f01f9 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 22:12:34 -0400 Subject: [PATCH 30/34] remiove non-ascii --- R/plot_net_country.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/plot_net_country.R b/R/plot_net_country.R index 4413d19..d8975be 100644 --- a/R/plot_net_country.R +++ b/R/plot_net_country.R @@ -80,10 +80,6 @@ plot_net_country <- function(data, data$country[data$country=="surinam"] <- "suriname" data$country[data$country=="turks & caicos"] <- "turks and caicos islands" data$country[data$country=="u arab emirates"] <- "united arab emirates" - data$country[data$country=="curaçao"] <- "curacao" - - # to avoid fail for non-ascii characters - data$country[data$country=="cura\u00e7ao"] <- "curacao" data$country[data$country=="libyan arab jamahiriya"] <- "libya" data$country[data$country=="rhodesia"] <- "zimbabwe" data$country[data$country=="russian federation"] <- "russia" @@ -92,12 +88,17 @@ plot_net_country <- function(data, data$country[data$country=="brunei darussalam"] <- "brunei" data$country[data$country=="trinidade and tobago"] <- "trinidad and tobago" + # to avoid fail for non-ascii characters + data$country[data$country=="cura\u00e7ao"] <- "curacao" + ## we could use a sparse matrix representation: linkages <- Matrix::spMatrix( nrow = length(unique(data$country)), ncol = length(unique(data$UT)), i = as.numeric(factor(data$country)), j = as.numeric(factor(data$UT)), + + x = rep(1, length(data$country)) ) From 1bf14e90498b997ea76b267bdb32c00a6f5fdf9a Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 22:24:56 -0400 Subject: [PATCH 31/34] replace size with linewidth line 416 --- R/plot_net_country.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/plot_net_country.R b/R/plot_net_country.R index d8975be..85ddba7 100644 --- a/R/plot_net_country.R +++ b/R/plot_net_country.R @@ -413,7 +413,7 @@ plot_net_country <- function(data, ggplot2::geom_path( data = allEdges, ggplot2::aes(x = !!x, y = !!y, group = !!Group, - colour = !!Sequence, size = !!Sequence), alpha = lineAlpha + colour = !!Sequence, linewidth = !!Sequence), alpha = lineAlpha ) + ggplot2::geom_point( data = data.frame(layoutCoordinates), # Add nodes From ead7aabb28c71d1e09f22de03d2cb65fbf5450f7 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 22:25:29 -0400 Subject: [PATCH 32/34] update license --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 71d1688..fb385e9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,7 @@ Description: Tools to parse and organize reference records downloaded from the the names of authors, geocode their locations, and generate/visualize coauthorship networks. This package has been peer-reviewed by rOpenSci (v. 1.0). -License: GPL-3.0 +License: GNU General Public License v3.0 URL: https://github.com/ropensci/refsplitr, https://docs.ropensci.org/refsplitr/ BugReports: https://github.com/ropensci/refsplitr/issues Depends: From 23eb10c229dcaa198d49dd04c5f634cfe0d16eca Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 22:50:51 -0400 Subject: [PATCH 33/34] license edit --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index fb385e9..b873658 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,7 @@ Description: Tools to parse and organize reference records downloaded from the the names of authors, geocode their locations, and generate/visualize coauthorship networks. This package has been peer-reviewed by rOpenSci (v. 1.0). -License: GNU General Public License v3.0 +License: GPL-3 URL: https://github.com/ropensci/refsplitr, https://docs.ropensci.org/refsplitr/ BugReports: https://github.com/ropensci/refsplitr/issues Depends: From 2f39979c25830a258af476c91c432b9a2de42645 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 24 Mar 2025 22:57:26 -0400 Subject: [PATCH 34/34] tweaks to address R CMD --- DESCRIPTION | 1 - tests/extdata/BadHeader.txt | 4 ++-- tests/extdata/PubExample.txt | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b873658..abb7ecc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -45,7 +45,6 @@ Imports: Hmisc, igraph, Matrix, - magrittr, network, stringdist, rworldmap, diff --git a/tests/extdata/BadHeader.txt b/tests/extdata/BadHeader.txt index 37762bd..ce212a2 100644 --- a/tests/extdata/BadHeader.txt +++ b/tests/extdata/BadHeader.txt @@ -1,4 +1,4 @@ -XX Thomson Reuters Web of Science™ +XX Thomson Reuters Web of Science™ VR 1.0 PT J AU Sobrinho, MS @@ -345,4 +345,4 @@ BP 1129 EP 1129 UT WOS:000341179800029 PM 25190784 -ER +ER \ No newline at end of file diff --git a/tests/extdata/PubExample.txt b/tests/extdata/PubExample.txt index b410da1..6761896 100644 --- a/tests/extdata/PubExample.txt +++ b/tests/extdata/PubExample.txt @@ -1,4 +1,4 @@ -FN Thomson Reuters Web of Science™ +FN Thomson Reuters Web of Science™ VR 1.0 PT J AU Sobrinho, MS @@ -345,4 +345,4 @@ BP 1129 EP 1129 UT WOS:000341179800029 PM 25190784 -ER +ER \ No newline at end of file