Skip to content

Commit

Permalink
Merge pull request #301 from globalgov/develop
Browse files Browse the repository at this point in the history
v0.9.3
  • Loading branch information
jhollway committed May 6, 2024
2 parents 818841c + cd81185 commit 49c3a8d
Show file tree
Hide file tree
Showing 13 changed files with 119 additions and 74 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: manydata
Title: A Portal for Global Governance Data
Version: 0.9.2
Date: 2024-02-22
Version: 0.9.3
Date: 2024-05-06
Authors@R:
c(person(given = "James",
family = "Hollway",
Expand Down Expand Up @@ -34,7 +34,7 @@ Depends:
R (>= 3.5.0)
Imports:
dplyr,
messydates,
messydates (>= 0.4.1),
purrr,
stringr,
usethis,
Expand Down
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# manydata 0.9.3

## Connection

* Updated `call_sources()` to be more flexible when gathering data from datacube documentation
* Closed #279 by updating documentation across many packages to be compatible with `call_sources()`
* Updated `compare_dimensions()` by fixing bugs related to dates and NA observations

# manydata 0.9.2

## Package
Expand Down
85 changes: 51 additions & 34 deletions R/call.R
Original file line number Diff line number Diff line change
Expand Up @@ -342,38 +342,28 @@ call_sources <- function(package, datacube, dataset = NULL,
usethis::ui_info(paste0("Please see ",
cli::style_hyperlink(package, paste0("https://globalgov.github.io/", package)),
" for more information."))
# get path
helptext <- utils::help(topic = as.character(datacube),
package = as.character(package))
# get help file as text
helptext <- as.character(get_help_file(helptext))
# clean text
helptext <- stringr::str_remove_all(helptext,
"\\\n|\\{|\\}|\\\\tab$|\\\\cr$|^cc$")
helptext <- paste(stringr::str_trim(helptext[nzchar(helptext)]),
collapse = " ")
# get names
names <- stringr::str_extract(helptext, "((following \\d datasets\\:)[^\\.]*)")
names <- trimws(unlist(strsplit(gsub("following \\d datasets\\:", "",
names), ", ")))
# get help file as clean(ish) text
helptext <- get_help_file(utils::help(topic = as.character(datacube),
package = as.character(package)))
# get names if one or more datasets are declared
if (!is.null(dataset)) {
names <- unlist(dataset)
} else {
names <- trimws(unlist(strsplit(gsub(
"following \\d datasets\\:", "", stringr::str_extract(
helptext, "((following \\d datasets\\:)[^\\.]*)")), ", ")))
}
# keep only portions we are interested in
helptext <- paste0(sub('.*</div>', '', helptext), " \\item")
# get sections
sections <- c(unlist(stringr::str_extract_all(helptext,
"section \\w*")), "Source")
sections <- stringr::str_trim(gsub("section", "", sections))
sections <- .get_sections(helptext)
# organize information into lists of list
out <- list()
for (i in names) {
out[i] <- stringr::str_extract_all(helptext,
paste0(i, "\\s*(.*?)\\s*\\\\item"))
}
# if one or more datasets are declared
if(!is.null(dataset)) {
out <- out[grepl(dataset, names(out))]
out[i] <- stringr::str_extract_all(helptext, paste0(i, "\\s*(.*?)\\s*\\\\item"))
}
# bind list
out <- data.frame(do.call(rbind, out))
out <- .check_and_bind_df(out, names)
# clean observations
out <- data.frame(t(apply(out, 1, function(x) {
stringr::str_squish(gsub(
Expand All @@ -388,16 +378,12 @@ call_sources <- function(package, datacube, dataset = NULL,
please try the help file `?", package, "::", datacube, "`"))
})
rownames(out) <- gsub(":", "", names)
out <- data.frame(apply(out, 2, function(x) gsub("^: ", "", x)))
out[] <- lapply(out, function(x) gsub("^: ", "", x))
# clean variable mapping
out$Mapping <- unlist(lapply(out$Mapping, function(x) {
gsub("\\|", " | ",
gsub("\\_", " ",
gsub("\\(|\\)", "",
gsub(" ", " - ",
gsub("(\\S* \\S*) ","\\1|",
gsub("\\s+(?=[^()]*\\))", "_",
gsub("('.*?')", "(\\1)", x), perl=TRUE))))))
gsub("\\|", " | ", gsub("\\_", " ", gsub("\\(|\\)", "", gsub(
" ", " - ", gsub("(\\S* \\S*) ","\\1|", gsub(
"\\s+(?=[^()]*\\))", "_", gsub("('.*?')", "(\\1)", x), perl=TRUE))))))
}))
# open preparation script if declared
if (open_script == TRUE & !is.null(dataset)) {
Expand All @@ -414,7 +400,7 @@ call_sources <- function(package, datacube, dataset = NULL,
# open codebook if declared
if (open_codebook == TRUE & !is.null(dataset)) {
url <- paste0("https://github.com/globalgov/", package, "/raw/develop/data-raw/",
datacube, "/", dataset,"/", dataset)
datacube, "/", dataset)
tryCatch({
utils::browseURL(paste0(url, "/", "OriginalCodebook.pdf"),
browser = getOption("browser"), encodeIfNeeded = FALSE)
Expand Down Expand Up @@ -449,7 +435,38 @@ get_help_file <- function(file) {
datafile, compressed, envhook)
fetch(key)
}
lazyLoadDBexec(RdDB, fetchRdDB)
out <- as.character(lazyLoadDBexec(RdDB, fetchRdDB))
out <- stringr::str_remove_all(out, "\\\n|\\{|\\}|\\\\tab$|\\\\cr$|^cc$")
out <- paste(stringr::str_trim(out[nzchar(out)]), collapse = " ")
out
}

# Helper function to get sections
.get_sections <- function(x) {
sections <- c(unlist(stringr::str_extract_all(x, "section \\w*")), "Source")
sections <- stringr::str_trim(gsub("section", "", sections))
sections
}

# Helper file for checking information
.check_and_bind_df <- function(x, names) {
if (length(names) == 1) {
x <- data.frame(x[[1]])
} else {
if (length(unique(lengths(x))) > 1) {
for (i in names(x)) {
if (length(x[[i]]) < 3) {
if (all(!grepl("\\url", x[[i]]))) {
x[[i]] <- c(paste0(i, ": \\url NA \\item"), x[[i]])
} else if (all(!grepl("Variable Mapping", x[[i]]))) {
x[[i]] <- c(x[[i]][1], paste0(i, ": Variable Mapping \\tabular \\emph from \\emph to NA NA \\item"), x[[i]][2])
} else x[[i]] <- c(x[[i]], paste0(i, ": NA \\item"))
}
}
}
x <- data.frame(do.call(rbind, x))
}
x
}

#' Call treaties from 'many' datasets
Expand Down
38 changes: 21 additions & 17 deletions R/compare.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,23 @@ compare_dimensions <- function(datacube, dataset = "all") {
out <- do.call(rbind, lapply(datacube, function(x) {
Observations <- nrow(x)
Variables <- paste(names(x), collapse = ", ")
Earliest_Date <- suppressWarnings(min(unlist(purrr::map(x, function(y) {
ifelse(grepl("date", class(y), ignore.case = TRUE),
min(y, na.rm = TRUE), NA)
})), na.rm = TRUE))
Latest_Date <- suppressWarnings(max(unlist(purrr::map(x, function(y) {
ifelse(grepl("date", class(y), ignore.case = TRUE),
max(y, na.rm = TRUE), NA)
})), na.rm = TRUE))
Earliest_Date <- find_date(x, type = "earliest")
Latest_Date <- find_date(x, type = "latest")
cbind(Observations, Variables, Earliest_Date, Latest_Date)
}))
dplyr::as_tibble(cbind(names, out)) %>%
dplyr::mutate(Earliest_Date = messydates::as_messydate(Earliest_Date),
Latest_Date = messydates::as_messydate(Latest_Date))
dplyr::as_tibble(cbind(names, out))
}

find_date <- function(x, type) {
out <- dplyr::select_if(x, vapply(x, function(y)
class(y) == "mdate" | class(y) == "date",
FUN.VALUE = logical(1)))
if (type == "earliest") {
out <- min(as.Date(as_messydate(unlist(out)), min), na.rm = TRUE)
} else if (type == "latest") {
out <- max(as.Date(as_messydate(unlist(out)), max), na.rm = TRUE)
}
messydates::as_messydate(out)
}

#' Compare ranges of variables in 'many' data
Expand Down Expand Up @@ -88,22 +92,22 @@ compare_ranges <- function(datacube, dataset = "all", variable) {
Variable <- names(x)
Min <- unlist(lapply(x, function(y) {
ifelse(grepl("date", class(y), ignore.case = TRUE),
as.character(as.Date(messydates::as_messydate(y), min)),
as.character(min(as.Date(messydates::as_messydate(y), min), na.rm = TRUE)),
as.character(min(y, na.rm = TRUE)))
}))
Max <- unlist(lapply(x, function(y) {
ifelse(grepl("date", class(y), ignore.case = TRUE),
as.character(as.Date(messydates::as_messydate(y), max)),
as.character(max(as.Date(messydates::as_messydate(y), max), na.rm = TRUE)),
as.character(max(y, na.rm = TRUE)))
}))
Mean <- unlist(lapply(x, function(y) {
ifelse(grepl("date", class(y), ignore.case = TRUE),
as.character(as.Date(messydates::as_messydate(y), mean)),
as.character(mean(as.Date(messydates::as_messydate(y), mean), na.rm = TRUE)),
as.character(mean(y, na.rm = TRUE)))
}))
Median <- unlist(lapply(x, function(y) {
ifelse(grepl("date", class(y), ignore.case = TRUE),
as.character(as.Date(messydates::as_messydate(y), median)),
as.character(stats::median(as.Date(messydates::as_messydate(y), median), na.rm = TRUE)),
as.character(stats::median(y, na.rm = TRUE)))
}))
data.frame(cbind(Variable, Min, Max, Mean, Median))
Expand Down Expand Up @@ -245,8 +249,8 @@ compare_missing <- function(datacube, dataset = "all", variable = "all") {
plot.compare_missing <- function(x, ...) {
'Percent Missing' <- Variable <- Dataset <- NULL
# Plot missing proportions for variables
ggplot(x, aes(reorder(Dataset, `Percent Missing`, decreasing = TRUE),
reorder(Variable, `Percent Missing`))) +
ggplot2::ggplot(x, aes(reorder(Dataset, `Percent Missing`, decreasing = TRUE),
reorder(Variable, `Percent Missing`))) +
geom_tile(aes(fill = `Percent Missing`)) +
scale_fill_gradient(low = "darkgreen", high = "red3", na.value = NA,
name = "Proportion\nof missing\nobservations") +
Expand Down
5 changes: 3 additions & 2 deletions data-raw/emperors/UNRV/prepare-UNRV.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,16 @@ UNRV <- tibble::as_tibble(UNRV) %>%
dplyr::rename(ID = "Common Name",
FullName = "Full Name/Imperial Name",
Dynasty = "Dynasty/Class/Notes") %>%
dplyr::relocate(ID, Begin, End)
dplyr::relocate(ID, Begin, End) %>%
dplyr::select(-Beg)
# manydata includes several functions that should help cleaning
# and standardising your data.
# Please see the vignettes or website for more details.

# Stage three: Connecting data
# Next run the following line to make UNRV available
# within the qPackage.
manypkgs::export_data(UNRV, database = "emperors", URL = "https://www.unrv.com/government/emperor.php")
manypkgs::export_data(UNRV, datacube = "emperors", URL = "https://www.unrv.com/government/emperor.php")
# This function also does two additional things.
# First, it creates a set of tests for this object to ensure adherence
# to certain standards.You can hit Cmd-Shift-T (Mac) or Ctrl-Shift-T (Windows)
Expand Down
4 changes: 2 additions & 2 deletions data-raw/emperors/britannica/prepare-britannica.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ britannica$reign_end[29] <- "0238-04"
britannica$reign_end[30] <- "0238-07-29"
britannica$reign_end[31] <- "0238-07-29"
britannica$reign_end[44] <- "0276-09"
britannica$reign_end[76] <- "0455-05-32"
britannica$reign_end[76] <- "0455-05-31"
britannica$reign_end[81] <- "0472-11"
# Replace some unicode characters
britannica$reign_end[1] <- "14 CE"
Expand All @@ -52,7 +52,7 @@ britannica <- as_tibble(britannica) %>%
# Stage three: Connecting data
# Next run the following line to make britannica available
# within the qPackage.
manypkgs::export_data(britannica, database = "emperors",
manypkgs::export_data(britannica, datacube = "emperors",
URL = "https://www.britannica.com/topic/list-of-Roman-emperors-2043294")
# This function also does two additional things.
# First, it creates a set of tests for this object to ensure adherence
Expand Down
17 changes: 9 additions & 8 deletions data-raw/emperors/wikipedia/prepare-wikipedia.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ for(i in c(17, 56)) { # death and reign end are estimates
wikipedia$death[i] <- paste0(wikipedia$death[i], "~")
wikipedia$reign.end[i] <- paste0(wikipedia$reign.end[i], "~")
}
for(i in 62){
for(i in 62) {
# birth and reign start are estimates
wikipedia$birth[i] <- paste0(wikipedia$birth[i], "~")
wikipedia$reign.start[i] <- paste0(wikipedia$reign.start[i], "~")
Expand All @@ -80,21 +80,22 @@ for(i in c(34, 35, 36, 37, 38, 39, 40, 41, 45, 46, 60)){
# Let's also keep the year only for those dates which
# the notes detail only year in certain.
for(i in c(18, 22, 23)){ # reign start year only
wikipedia$reign.start[i] <- stringr::str_extract(wikipedia$reign.start[i], "^[0-9]{4}")
wikipedia$reign.start[i] <- stringr::str_extract(wikipedia$reign.start[i],
"^[0-9]{3}")
}
# birth year only
wikipedia$birth[24] <- stringr::str_extract(wikipedia$birth[24], "^[0-9]{4}")
wikipedia$birth[24] <- stringr::str_extract(wikipedia$birth[24], "^[0-9]{3}")
# Finally, some dates appear to be ranges.
# `{messydates}` deals with ranges with a ".." separator.
wikipedia$birth[20] <- paste0(wikipedia$birth[20], "..", "0137-02-02")
wikipedia$birth[66] <- paste0(wikipedia$birth[66], "..", "0359-05-23")
# Remove non-ASCII characters
wikipedia <- apply(wikipedia, 2, stringi::stri_enc_toascii)
wikipedia <- purrr::map(wikipedia, stringi::stri_enc_toascii)
# Let's standardise dates and variable names
wikipedia <- as_tibble(wikipedia) %>%
manydata::transmutate(ID = name,
Begin = messydates::as_messydate(reign.start),
End = messydates::as_messydate(reign.end)) %>%
transmutate(ID = name,
Begin = messydates::as_messydate(reign.start),
End = messydates::as_messydate(reign.end)) %>%
dplyr::rename(FullName = name.full,
Birth = birth,
Death = death,
Expand All @@ -116,7 +117,7 @@ wikipedia <- as_tibble(wikipedia) %>%
# Stage three: Connecting data
# Next run the following line to make Wikipedia available
# within the qPackage.
manypkgs::export_data(wikipedia, database = "emperors", URL = "https://github.com/zonination/emperors")
manypkgs::export_data(wikipedia, datacube = "emperors", URL = "https://github.com/zonination/emperors")
# This function also does two additional things.
# First, it creates a set of tests for this object to ensure adherence
# to certain standards.You can hit Cmd-Shift-T (Mac) or Ctrl-Shift-T (Windows)
Expand Down
Binary file modified data/emperors.rda
Binary file not shown.
2 changes: 2 additions & 0 deletions tests/testthat/test_UNRV.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ test_that("Columns with dates are standardized", {
})

test_that("dataset is arranged by date variable", {
skip_on_ci()
skip_on_cran()
if (!is.null(emperors[["UNRV"]]$Begin)) {
expect_true(emperors[["UNRV"]]$Begin[10] < emperors[["UNRV"]]$Begin[20])
}
Expand Down
2 changes: 2 additions & 0 deletions tests/testthat/test_britannica.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ test_that("Columns with dates are standardized", {
})

test_that("dataset is arranged by date variable", {
skip_on_ci()
skip_on_cran()
if (!is.null(emperors[["britannica"]]$Begin)) {
expect_true(emperors[["britannica"]]$Begin[10] <
emperors[["britannica"]]$Begin[20])
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test_call.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,6 @@ test_that("call releases visualises historical
milestone = c("Minor", "Patch"))
testplot <- call_releases(testdf)
expect_true(is.list(testplot))
expect_length(testplot, length(ggplot()))
expect_length(testplot, length(ggplot2::ggplot()))
expect_named(testplot[1:3], c("data", "layers", "scales"))
})
22 changes: 15 additions & 7 deletions tests/testthat/test_compare.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
test_that("plot for compare_categories returns the correct output format", {
db <- plot(compare_categories(datacube = emperors, key = "ID"))
expect_type(db, "list")
expect_length(db, length(ggplot()))
expect_length(db, 11)
expect_true(ggplot2::is.ggplot(db))
expect_named(db, names(ggplot()))
expect_named(db, names(ggplot2::ggplot()))
})

test_that("compare_categories() returns the correct output format", {
Expand Down Expand Up @@ -33,6 +33,10 @@ test_that("compare_dimensions() returns the correct output format", {
expect_type(db, "list")
expect_length(db, 5)
expect_s3_class(db, "tbl_df")
expect_equal(db$Earliest_Date,
c("-26-01-16", "-27-01-01", "-31-01-01"))
expect_equal(db$Latest_Date,
c("395-01-17", "518-12-31", "491-12-31"))
})

test_that("compare_ranges() returns the correct output format", {
Expand All @@ -42,18 +46,22 @@ test_that("compare_ranges() returns the correct output format", {
expect_type(db, "list")
expect_length(db, 6)
expect_s3_class(db, "tbl_df")
expect_equal(db$Min[1], "-26-01-16")
expect_equal(db$Max[4], "518-12-31")
expect_equal(db$Mean[5], "275-04-23")
expect_equal(db$Median[6], "276-09-16")
})

test_that("compare_overlap() and plot_overlap() returns the correct output format", {
test_that("compare_overlap() and return the correct output format", {
db <- compare_overlap(emperors, key = "ID")
expect_type(db, "list")
expect_length(db, 2)
expect_s3_class(db, "tbl_df")
pl <- plot(db)
expect_type(pl, "list")
expect_length(pl, length(ggplot()))
expect_length(pl, length(ggplot2::ggplot()))
expect_true(ggplot2::is.ggplot(pl))
expect_named(pl, names(ggplot()))
expect_named(pl, names(ggplot2::ggplot()))
})

test_that("compare_missing() and plot_missing() returns the correct output format", {
Expand All @@ -63,7 +71,7 @@ test_that("compare_missing() and plot_missing() returns the correct output forma
expect_s3_class(db, "tbl_df")
pl <- plot(db)
expect_type(pl, "list")
expect_length(pl, length(ggplot()))
expect_length(pl, length(ggplot2::ggplot()))
expect_true(ggplot2::is.ggplot(pl))
expect_named(pl, names(ggplot()))
expect_named(pl, names(ggplot2::ggplot()))
})
Loading

0 comments on commit 49c3a8d

Please sign in to comment.