Merge pull request #301 from globalgov/develop

v0.9.3
globalgov · May 6, 2024 · 49c3a8d · 49c3a8d
2 parents 818841c + cd81185
commit 49c3a8d
Show file tree

Hide file tree

Showing 13 changed files with 119 additions and 74 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: manydata
 Title: A Portal for Global Governance Data
-Version: 0.9.2
-Date: 2024-02-22
+Version: 0.9.3
+Date: 2024-05-06
 Authors@R:
     c(person(given = "James",
              family = "Hollway",
@@ -34,7 +34,7 @@ Depends:
     R (>= 3.5.0)
 Imports:
     dplyr,
-    messydates,
+    messydates (>= 0.4.1),
     purrr,
     stringr,
     usethis,

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,11 @@
+# manydata 0.9.3
+
+## Connection
+
+* Updated `call_sources()` to be more flexible when gathering data from datacube documentation
+* Closed #279 by updating documentation across many packages to be compatible with `call_sources()`
+* Updated `compare_dimensions()` by fixing bugs related to dates and NA observations
+
 # manydata 0.9.2
 
 ## Package

diff --git a/R/call.R b/R/call.R
@@ -342,38 +342,28 @@ call_sources <- function(package, datacube, dataset = NULL,
   usethis::ui_info(paste0("Please see ",
                           cli::style_hyperlink(package, paste0("https://globalgov.github.io/", package)),
                           " for  more information."))
-  # get path
-  helptext <- utils::help(topic = as.character(datacube),
-                          package = as.character(package))
-  # get help file as text
-  helptext <- as.character(get_help_file(helptext))
-  # clean text
-  helptext <- stringr::str_remove_all(helptext,
-                                      "\\\n|\\{|\\}|\\\\tab$|\\\\cr$|^cc$")
-  helptext <- paste(stringr::str_trim(helptext[nzchar(helptext)]),
-                    collapse = " ")
-  # get names
-  names <- stringr::str_extract(helptext, "((following \\d datasets\\:)[^\\.]*)")
-  names <- trimws(unlist(strsplit(gsub("following \\d datasets\\:", "",
-                                       names), ", ")))
+  # get help file as clean(ish) text
+  helptext <- get_help_file(utils::help(topic = as.character(datacube),
+                                        package = as.character(package)))
+  # get names if one or more datasets are declared
+  if (!is.null(dataset)) {
+    names <- unlist(dataset)
+  } else {
+    names <- trimws(unlist(strsplit(gsub(
+      "following \\d datasets\\:", "", stringr::str_extract(
+        helptext, "((following \\d datasets\\:)[^\\.]*)")), ", ")))
+  }
   # keep only portions we are interested in
   helptext <- paste0(sub('.*</div>', '', helptext), " \\item")
   # get sections
-  sections <- c(unlist(stringr::str_extract_all(helptext,
-                                                "section \\w*")), "Source")
-  sections <- stringr::str_trim(gsub("section", "", sections))
+  sections <- .get_sections(helptext)
   # organize information into lists of list
   out <- list()
   for (i in names) {
-    out[i] <- stringr::str_extract_all(helptext,
-                                       paste0(i, "\\s*(.*?)\\s*\\\\item"))
-  }
-  # if one or more datasets are declared
-  if(!is.null(dataset)) {
-    out <- out[grepl(dataset, names(out))]
+    out[i] <- stringr::str_extract_all(helptext, paste0(i, "\\s*(.*?)\\s*\\\\item"))
   }
   # bind list
-  out <- data.frame(do.call(rbind, out))
+  out <- .check_and_bind_df(out, names)
   # clean observations
   out <- data.frame(t(apply(out, 1, function(x) {
     stringr::str_squish(gsub(
@@ -388,16 +378,12 @@ call_sources <- function(package, datacube, dataset = NULL,
                 please try the help file `?", package, "::", datacube, "`"))
   })
   rownames(out) <- gsub(":", "", names)
-  out <- data.frame(apply(out, 2, function(x) gsub("^: ", "", x)))
+  out[] <- lapply(out, function(x) gsub("^: ", "", x))
   # clean variable mapping
   out$Mapping <- unlist(lapply(out$Mapping, function(x) {
-    gsub("\\|", " | ",
-         gsub("\\_", " ",
-              gsub("\\(|\\)", "",
-                   gsub(" ", " - ",
-                        gsub("(\\S* \\S*) ","\\1|",
-                             gsub("\\s+(?=[^()]*\\))", "_",
-                                  gsub("('.*?')", "(\\1)", x), perl=TRUE))))))
+    gsub("\\|", " | ", gsub("\\_", " ", gsub("\\(|\\)", "", gsub(
+      " ", " - ", gsub("(\\S* \\S*) ","\\1|", gsub(
+        "\\s+(?=[^()]*\\))", "_", gsub("('.*?')", "(\\1)", x), perl=TRUE))))))
   }))
   # open preparation script if declared
   if (open_script == TRUE & !is.null(dataset)) {
@@ -414,7 +400,7 @@ call_sources <- function(package, datacube, dataset = NULL,
   # open codebook if declared
   if (open_codebook == TRUE & !is.null(dataset)) {
     url <- paste0("https://github.com/globalgov/", package, "/raw/develop/data-raw/",
-                  datacube, "/", dataset,"/", dataset)
+                  datacube, "/", dataset)
     tryCatch({
       utils::browseURL(paste0(url, "/", "OriginalCodebook.pdf"),
                        browser = getOption("browser"), encodeIfNeeded = FALSE)
@@ -449,7 +435,38 @@ get_help_file <- function(file) {
                                            datafile, compressed, envhook)
     fetch(key)
   }
-  lazyLoadDBexec(RdDB, fetchRdDB)
+  out <- as.character(lazyLoadDBexec(RdDB, fetchRdDB))
+  out <- stringr::str_remove_all(out, "\\\n|\\{|\\}|\\\\tab$|\\\\cr$|^cc$")
+  out <- paste(stringr::str_trim(out[nzchar(out)]), collapse = " ")
+  out
+}
+
+# Helper function to get sections
+.get_sections <- function(x) {
+  sections <- c(unlist(stringr::str_extract_all(x, "section \\w*")), "Source")
+  sections <- stringr::str_trim(gsub("section", "", sections))
+  sections
+}
+
+# Helper file for checking information
+.check_and_bind_df <- function(x, names) {
+  if (length(names) == 1) {
+    x <- data.frame(x[[1]])
+  } else {
+    if (length(unique(lengths(x))) > 1) {
+      for (i in names(x)) {
+        if (length(x[[i]]) < 3) {
+          if (all(!grepl("\\url", x[[i]]))) {
+            x[[i]] <- c(paste0(i, ": \\url NA \\item"), x[[i]])
+          } else if (all(!grepl("Variable Mapping", x[[i]]))) {
+            x[[i]] <- c(x[[i]][1], paste0(i, ": Variable Mapping \\tabular  \\emph from   \\emph to  NA NA \\item"), x[[i]][2])
+          } else x[[i]] <- c(x[[i]], paste0(i, ": NA \\item"))
+        }
+      }
+    }
+    x <- data.frame(do.call(rbind, x))
+  }
+  x
 }
 
 #' Call treaties from 'many' datasets

diff --git a/R/compare.R b/R/compare.R
@@ -31,19 +31,23 @@ compare_dimensions <- function(datacube, dataset = "all") {
   out <- do.call(rbind, lapply(datacube, function(x) {
     Observations <- nrow(x)
     Variables <- paste(names(x), collapse = ", ")
-    Earliest_Date <- suppressWarnings(min(unlist(purrr::map(x, function(y) {
-      ifelse(grepl("date", class(y), ignore.case = TRUE),
-             min(y, na.rm = TRUE), NA)
-    })), na.rm = TRUE))
-    Latest_Date <- suppressWarnings(max(unlist(purrr::map(x, function(y) {
-      ifelse(grepl("date", class(y), ignore.case = TRUE),
-             max(y, na.rm = TRUE), NA)
-    })), na.rm = TRUE))
+    Earliest_Date <- find_date(x, type = "earliest")
+    Latest_Date <- find_date(x, type = "latest")
     cbind(Observations, Variables, Earliest_Date, Latest_Date)
   }))
-  dplyr::as_tibble(cbind(names, out)) %>%
-    dplyr::mutate(Earliest_Date = messydates::as_messydate(Earliest_Date),
-                  Latest_Date = messydates::as_messydate(Latest_Date))
+  dplyr::as_tibble(cbind(names, out))
+}
+
+find_date <- function(x, type) {
+  out <- dplyr::select_if(x, vapply(x, function(y) 
+    class(y) == "mdate" | class(y) == "date",
+    FUN.VALUE = logical(1)))
+  if (type == "earliest") {
+    out <- min(as.Date(as_messydate(unlist(out)), min), na.rm = TRUE)
+  } else if (type == "latest") {
+    out <- max(as.Date(as_messydate(unlist(out)), max), na.rm = TRUE)
+  }
+  messydates::as_messydate(out)
 }
 
 #' Compare ranges of variables in 'many' data
@@ -88,22 +92,22 @@ compare_ranges <- function(datacube, dataset = "all", variable) {
     Variable <- names(x)
     Min <- unlist(lapply(x, function(y) {
       ifelse(grepl("date", class(y), ignore.case = TRUE),
-             as.character(as.Date(messydates::as_messydate(y), min)),
+             as.character(min(as.Date(messydates::as_messydate(y), min), na.rm = TRUE)),
              as.character(min(y, na.rm = TRUE)))
     }))
     Max <- unlist(lapply(x, function(y) {
       ifelse(grepl("date", class(y), ignore.case = TRUE),
-             as.character(as.Date(messydates::as_messydate(y), max)),
+             as.character(max(as.Date(messydates::as_messydate(y), max), na.rm = TRUE)),
              as.character(max(y, na.rm = TRUE)))
     }))
     Mean <- unlist(lapply(x, function(y) {
       ifelse(grepl("date", class(y), ignore.case = TRUE),
-             as.character(as.Date(messydates::as_messydate(y), mean)),
+             as.character(mean(as.Date(messydates::as_messydate(y), mean), na.rm = TRUE)),
              as.character(mean(y, na.rm = TRUE)))
     }))
     Median <- unlist(lapply(x, function(y) {
       ifelse(grepl("date", class(y), ignore.case = TRUE),
-             as.character(as.Date(messydates::as_messydate(y), median)),
+             as.character(stats::median(as.Date(messydates::as_messydate(y), median), na.rm = TRUE)),
              as.character(stats::median(y, na.rm = TRUE)))
     }))
     data.frame(cbind(Variable, Min, Max, Mean, Median))
@@ -245,8 +249,8 @@ compare_missing <- function(datacube, dataset = "all", variable = "all") {
 plot.compare_missing <- function(x, ...) {
   'Percent Missing' <- Variable <- Dataset <- NULL
   # Plot missing proportions for variables
-  ggplot(x, aes(reorder(Dataset, `Percent Missing`, decreasing = TRUE),
-                  reorder(Variable, `Percent Missing`))) +
+  ggplot2::ggplot(x, aes(reorder(Dataset, `Percent Missing`, decreasing = TRUE),
+                         reorder(Variable, `Percent Missing`))) +
     geom_tile(aes(fill = `Percent Missing`)) +
     scale_fill_gradient(low = "darkgreen", high = "red3", na.value = NA,
                         name = "Proportion\nof missing\nobservations") +

diff --git a/data-raw/emperors/UNRV/prepare-UNRV.R b/data-raw/emperors/UNRV/prepare-UNRV.R
@@ -19,15 +19,16 @@ UNRV <- tibble::as_tibble(UNRV) %>%
   dplyr::rename(ID = "Common Name",
          FullName = "Full Name/Imperial Name",
          Dynasty = "Dynasty/Class/Notes") %>%
-  dplyr::relocate(ID, Begin, End)
+  dplyr::relocate(ID, Begin, End) %>% 
+  dplyr::select(-Beg)
 # manydata includes several functions that should help cleaning
 # and standardising your data.
 # Please see the vignettes or website for more details.
 
 # Stage three: Connecting data
 # Next run the following line to make UNRV available
 # within the qPackage.
-manypkgs::export_data(UNRV, database = "emperors", URL = "https://www.unrv.com/government/emperor.php")
+manypkgs::export_data(UNRV, datacube = "emperors", URL = "https://www.unrv.com/government/emperor.php")
 # This function also does two additional things.
 # First, it creates a set of tests for this object to ensure adherence
 # to certain standards.You can hit Cmd-Shift-T (Mac) or Ctrl-Shift-T (Windows)

diff --git a/data-raw/emperors/britannica/prepare-britannica.R b/data-raw/emperors/britannica/prepare-britannica.R
@@ -36,7 +36,7 @@ britannica$reign_end[29] <- "0238-04"
 britannica$reign_end[30] <- "0238-07-29"
 britannica$reign_end[31] <- "0238-07-29"
 britannica$reign_end[44] <- "0276-09"
-britannica$reign_end[76] <- "0455-05-32"
+britannica$reign_end[76] <- "0455-05-31"
 britannica$reign_end[81] <- "0472-11"
 # Replace some unicode characters
 britannica$reign_end[1] <- "14 CE"
@@ -52,7 +52,7 @@ britannica <- as_tibble(britannica) %>%
 # Stage three: Connecting data
 # Next run the following line to make britannica available
 # within the qPackage.
-manypkgs::export_data(britannica, database = "emperors",
+manypkgs::export_data(britannica, datacube = "emperors",
                       URL = "https://www.britannica.com/topic/list-of-Roman-emperors-2043294")
 # This function also does two additional things.
 # First, it creates a set of tests for this object to ensure adherence

diff --git a/data-raw/emperors/wikipedia/prepare-wikipedia.R b/data-raw/emperors/wikipedia/prepare-wikipedia.R
@@ -53,7 +53,7 @@ for(i in c(17, 56)) { # death and reign end are estimates
   wikipedia$death[i] <-  paste0(wikipedia$death[i], "~")
   wikipedia$reign.end[i] <-  paste0(wikipedia$reign.end[i], "~") 
 }
-for(i in 62){ 
+for(i in 62) { 
   # birth and reign start are estimates
   wikipedia$birth[i] <-  paste0(wikipedia$birth[i], "~")
   wikipedia$reign.start[i] <-  paste0(wikipedia$reign.start[i], "~")
@@ -80,21 +80,22 @@ for(i in c(34, 35, 36, 37, 38, 39, 40, 41, 45, 46, 60)){
 # Let's also keep the year only for those dates which
 # the notes detail only year in certain.
 for(i in c(18, 22, 23)){ # reign start year only
-  wikipedia$reign.start[i] <-  stringr::str_extract(wikipedia$reign.start[i], "^[0-9]{4}")
+  wikipedia$reign.start[i] <-  stringr::str_extract(wikipedia$reign.start[i],
+                                                    "^[0-9]{3}")
 }
 # birth year only
-wikipedia$birth[24] <-  stringr::str_extract(wikipedia$birth[24], "^[0-9]{4}")
+wikipedia$birth[24] <-  stringr::str_extract(wikipedia$birth[24], "^[0-9]{3}")
 # Finally, some dates appear to be ranges.
 # `{messydates}` deals with ranges with a ".." separator.
 wikipedia$birth[20] <-  paste0(wikipedia$birth[20], "..", "0137-02-02")
 wikipedia$birth[66] <-  paste0(wikipedia$birth[66], "..", "0359-05-23")
 # Remove non-ASCII characters
-wikipedia <- apply(wikipedia, 2, stringi::stri_enc_toascii)
+wikipedia <- purrr::map(wikipedia, stringi::stri_enc_toascii)
 # Let's standardise dates and variable names
 wikipedia <- as_tibble(wikipedia) %>%
-  manydata::transmutate(ID = name,
-                        Begin = messydates::as_messydate(reign.start),
-                        End = messydates::as_messydate(reign.end)) %>%
+  transmutate(ID = name,
+              Begin = messydates::as_messydate(reign.start),
+              End = messydates::as_messydate(reign.end)) %>%
   dplyr::rename(FullName = name.full,
                 Birth = birth,
                 Death = death,
@@ -116,7 +117,7 @@ wikipedia <- as_tibble(wikipedia) %>%
 # Stage three: Connecting data
 # Next run the following line to make Wikipedia available
 # within the qPackage.
-manypkgs::export_data(wikipedia,  database = "emperors", URL = "https://github.com/zonination/emperors")
+manypkgs::export_data(wikipedia,  datacube = "emperors", URL = "https://github.com/zonination/emperors")
 # This function also does two additional things.
 # First, it creates a set of tests for this object to ensure adherence
 # to certain standards.You can hit Cmd-Shift-T (Mac) or Ctrl-Shift-T (Windows)

diff --git a/data/emperors.rda b/data/emperors.rda
diff --git a/tests/testthat/test_UNRV.R b/tests/testthat/test_UNRV.R
@@ -29,6 +29,8 @@ test_that("Columns with dates are standardized", {
 })
 
 test_that("dataset is arranged by date variable", {
+  skip_on_ci()
+  skip_on_cran()
   if (!is.null(emperors[["UNRV"]]$Begin)) {
     expect_true(emperors[["UNRV"]]$Begin[10] < emperors[["UNRV"]]$Begin[20])
   }

diff --git a/tests/testthat/test_britannica.R b/tests/testthat/test_britannica.R
@@ -29,6 +29,8 @@ test_that("Columns with dates are standardized", {
 })
 
 test_that("dataset is arranged by date variable", {
+  skip_on_ci()
+  skip_on_cran()
   if (!is.null(emperors[["britannica"]]$Begin)) {
     expect_true(emperors[["britannica"]]$Begin[10] <
                   emperors[["britannica"]]$Begin[20])

diff --git a/tests/testthat/test_call.R b/tests/testthat/test_call.R
@@ -41,6 +41,6 @@ test_that("call releases visualises historical
                                  milestone = c("Minor", "Patch"))
             testplot <- call_releases(testdf)
             expect_true(is.list(testplot))
-            expect_length(testplot, length(ggplot()))
+            expect_length(testplot, length(ggplot2::ggplot()))
             expect_named(testplot[1:3], c("data", "layers", "scales"))
 })
diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R
@@ -3,9 +3,9 @@
 test_that("plot for compare_categories returns the correct output format", {
   db <- plot(compare_categories(datacube = emperors, key = "ID"))
   expect_type(db, "list")
-  expect_length(db, length(ggplot()))
+  expect_length(db, 11)
   expect_true(ggplot2::is.ggplot(db))
-  expect_named(db, names(ggplot()))
+  expect_named(db, names(ggplot2::ggplot()))
 })
 
 test_that("compare_categories() returns the correct output format", {
@@ -33,6 +33,10 @@ test_that("compare_dimensions() returns the correct output format", {
   expect_type(db, "list")
   expect_length(db, 5)
   expect_s3_class(db, "tbl_df")
+  expect_equal(db$Earliest_Date,
+               c("-26-01-16", "-27-01-01", "-31-01-01"))
+  expect_equal(db$Latest_Date,
+               c("395-01-17", "518-12-31", "491-12-31"))
 })
 
 test_that("compare_ranges() returns the correct output format", {
@@ -42,18 +46,22 @@ test_that("compare_ranges() returns the correct output format", {
   expect_type(db, "list")
   expect_length(db, 6)
   expect_s3_class(db, "tbl_df")
+  expect_equal(db$Min[1], "-26-01-16")
+  expect_equal(db$Max[4], "518-12-31")
+  expect_equal(db$Mean[5], "275-04-23")
+  expect_equal(db$Median[6], "276-09-16")
 })
 
-test_that("compare_overlap() and plot_overlap() returns the correct output format", {
+test_that("compare_overlap() and return the correct output format", {
   db <- compare_overlap(emperors, key = "ID")
   expect_type(db, "list")
   expect_length(db, 2)
   expect_s3_class(db, "tbl_df")
   pl <- plot(db)
   expect_type(pl, "list")
-  expect_length(pl, length(ggplot()))
+  expect_length(pl, length(ggplot2::ggplot()))
   expect_true(ggplot2::is.ggplot(pl))
-  expect_named(pl, names(ggplot()))
+  expect_named(pl, names(ggplot2::ggplot()))
 })
 
 test_that("compare_missing() and plot_missing() returns the correct output format", {
@@ -63,7 +71,7 @@ test_that("compare_missing() and plot_missing() returns the correct output forma
   expect_s3_class(db, "tbl_df")
   pl <- plot(db)
   expect_type(pl, "list")
-  expect_length(pl, length(ggplot()))
+  expect_length(pl, length(ggplot2::ggplot()))
   expect_true(ggplot2::is.ggplot(pl))
-  expect_named(pl, names(ggplot()))
+  expect_named(pl, names(ggplot2::ggplot()))
 })