Loading Libraries
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(viridis)
## Warning: package 'viridis' was built under R version 4.4.3
## Loading required package: viridisLite
Load and Prepare Data
# Load WHO child mortality dataset (assumes combined data file)
data <- read_csv("dataset_datascience.csv")
## Rows: 129564 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): REF_AREA, Geographic area, Regional group, Indicator, Sex, Wealth ...
## dbl (6): Reference Date, Observation Value, Lower Bound, Upper Bound, Stand...
## lgl (1): Definition
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View column names to verify structure
colnames(data)
## [1] "REF_AREA" "Geographic area" "Regional group"
## [4] "Indicator" "Sex" "Wealth Quintile"
## [7] "Series Name" "Series Year" "Reference Date"
## [10] "Observation Value" "Lower Bound" "Upper Bound"
## [13] "Standard Error" "Country notes" "Observation Status"
## [16] "Unit of measure" "Series Type" "Series Category"
## [19] "Series Method" "Age Group of Women" "Time Since First Birth"
## [22] "Definition" "Interval"
# Load shapefile for mapping, standardize names
eac_shape <- st_read("EAC_COUNTRIES.shp") %>%
mutate(Country = case_when(
NAME == "Democratic Republic of the Congo" ~ "Congo DRC",
NAME == "United Republic of Tanzania" ~ "Tanzania",
TRUE ~ NAME
))
## Reading layer `EAC_COUNTRIES' from data source
## `D:\Data Analysis Projects\CEMA DATA SCIENCE\EAC_COUNTRIES.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 8 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 1359718 ymin: -1512113 xmax: 5723082 ymax: 1357225
## Projected CRS: WGS 84 / Pseudo-Mercator
print(unique(eac_shape$Country))
## [1] "Burundi" "Congo DRC" "Kenya" "Rwanda" "Tanzania"
## [6] "South Sudan" "Uganda" "Somalia"
# Define EAC countries
eac_countries <- c("Burundi", "Kenya", "Rwanda", "South Sudan",
"Tanzania", "Uganda", "Congo DRC", "Somalia")
# Standardize and filter for EAC countries only
mortality_data <- data %>%
mutate(
Country = case_when(
`Geographic area` %in% c("Congo, Dem. Rep.", "Democratic Republic of the Congo") ~ "Congo DRC",
`Geographic area` %in% c("Tanzania, United Rep.", "United Republic of Tanzania") ~ "Tanzania",
TRUE ~ `Geographic area`
),
Year = as.numeric(substr(`Series Year`, 1, 4))
) %>%
filter(Country %in% eac_countries)
mortality_data
## # A tibble: 5,795 × 25
## REF_AREA `Geographic area` `Regional group` Indicator Sex `Wealth Quintile`
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 BDI Burundi <NA> Neonatal… Total Total
## 2 BDI Burundi <NA> Neonatal… Total Total
## 3 BDI Burundi <NA> Neonatal… Total Total
## 4 BDI Burundi <NA> Neonatal… Total Total
## 5 BDI Burundi <NA> Neonatal… Total Total
## 6 BDI Burundi <NA> Neonatal… Total Total
## 7 BDI Burundi <NA> Neonatal… Total Total
## 8 BDI Burundi <NA> Neonatal… Total Total
## 9 BDI Burundi <NA> Neonatal… Total Total
## 10 BDI Burundi <NA> Neonatal… Total Total
## # ℹ 5,785 more rows
## # ℹ 19 more variables: `Series Name` <chr>, `Series Year` <chr>,
## # `Reference Date` <dbl>, `Observation Value` <dbl>, `Lower Bound` <dbl>,
## # `Upper Bound` <dbl>, `Standard Error` <dbl>, `Country notes` <chr>,
## # `Observation Status` <chr>, `Unit of measure` <chr>, `Series Type` <chr>,
## # `Series Category` <chr>, `Series Method` <chr>, `Age Group of Women` <chr>,
## # `Time Since First Birth` <chr>, Definition <lgl>, Interval <dbl>, …
unique(data$Indicator)
## [1] "Neonatal mortality rate" "Under-five mortality rate"
# Separate neonatal and under-5 mortality
neonatal <- mortality_data %>%
filter(Indicator == "Neonatal mortality rate")
under5 <- mortality_data %>%
filter(Indicator == "Under-five mortality rate")
summary(neonatal)
## REF_AREA Geographic area Regional group Indicator
## Length:622 Length:622 Length:622 Length:622
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Sex Wealth Quintile Series Name Series Year
## Length:622 Length:622 Length:622 Length:622
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Reference Date Observation Value Lower Bound Upper Bound
## Min. :1954 Min. :11.10 Min. :11.07 Min. : 23.29
## 1st Qu.:1984 1st Qu.:28.04 1st Qu.:22.57 1st Qu.: 32.79
## Median :1996 Median :38.10 Median :32.35 Median : 45.69
## Mean :1995 Mean :38.00 Mean :31.52 Mean : 48.32
## 3rd Qu.:2008 3rd Qu.:45.62 3rd Qu.:38.70 3rd Qu.: 60.65
## Max. :2024 Max. :70.16 Max. :56.20 Max. :113.45
## NA's :197 NA's :197
## Standard Error Country notes Observation Status Unit of measure
## Min. : 1.472 Length:622 Length:622 Length:622
## 1st Qu.: 3.058 Class :character Class :character Class :character
## Median : 4.310 Mode :character Mode :character Mode :character
## Mean : 5.238
## 3rd Qu.: 6.186
## Max. :34.290
## NA's :427
## Series Type Series Category Series Method Age Group of Women
## Length:622 Length:622 Length:622 Length:622
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Time Since First Birth Definition Interval Country
## Length:622 Mode:logical Min. :1.000 Length:622
## Class :character NA's:622 1st Qu.:1.000 Class :character
## Mode :character Median :1.000 Mode :character
## Mean :2.258
## 3rd Qu.:5.000
## Max. :5.000
## NA's :1
## Year
## Min. :1977
## 1st Qu.:2014
## Median :2023
## Mean :2017
## 3rd Qu.:2023
## Max. :2023
##
summary(under5)
## REF_AREA Geographic area Regional group Indicator
## Length:5173 Length:5173 Length:5173 Length:5173
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Sex Wealth Quintile Series Name Series Year
## Length:5173 Length:5173 Length:5173 Length:5173
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Reference Date Observation Value Lower Bound Upper Bound
## Min. :1952 Min. : -2.835 Min. : 17.65 Min. : 38.94
## 1st Qu.:1988 1st Qu.: 93.067 1st Qu.: 63.34 1st Qu.: 99.99
## Median :1998 Median :144.200 Median :122.20 Median :174.22
## Mean :1997 Mean :146.088 Mean :123.83 Mean :183.20
## 3rd Qu.:2008 3rd Qu.:186.396 3rd Qu.:169.80 3rd Qu.:235.75
## Max. :2024 Max. :773.174 Max. :634.89 Max. :929.16
## NA's :2286 NA's :2286
## Standard Error Country notes Observation Status Unit of measure
## Min. : 0.149 Length:5173 Length:5173 Length:5173
## 1st Qu.: 8.700 Class :character Class :character Class :character
## Median : 11.200 Mode :character Mode :character Mode :character
## Mean : 12.768
## 3rd Qu.: 14.800
## Max. :180.790
## NA's :3461
## Series Type Series Category Series Method Age Group of Women
## Length:5173 Length:5173 Length:5173 Length:5173
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Time Since First Birth Definition Interval Country
## Length:5173 Mode:logical Min. :1.000 Length:5173
## Class :character NA's:5173 1st Qu.:1.000 Class :character
## Mode :character Median :1.000 Mode :character
## Mean :1.697
## 3rd Qu.:1.000
## Max. :5.000
## NA's :768
## Year
## Min. :1955
## 1st Qu.:2007
## Median :2023
## Mean :2014
## 3rd Qu.:2023
## Max. :2023
##
Merge with Latest Data and Add Labels
#Latest data
get_latest <- function(df) {
df %>%
group_by(Country) %>%
filter(Year == max(Year, na.rm = TRUE)) %>%
ungroup() %>%
select(Country, `Observation Value`, Year)
}
neonatal_latest <- get_latest(neonatal) %>% rename(NeonatalRate = `Observation Value`)
under5_latest <- get_latest(under5) %>% rename(Under5Rate = `Observation Value`)
neonatal_latest
## # A tibble: 425 × 3
## Country NeonatalRate Year
## <chr> <dbl> <dbl>
## 1 Burundi 49.7 2023
## 2 Burundi 49.9 2023
## 3 Burundi 50.0 2023
## 4 Burundi 49.9 2023
## 5 Burundi 49.9 2023
## 6 Burundi 50.0 2023
## 7 Burundi 49.9 2023
## 8 Burundi 49.7 2023
## 9 Burundi 50.7 2023
## 10 Burundi 49.3 2023
## # ℹ 415 more rows
under5_latest
## # A tibble: 2,887 × 3
## Country Under5Rate Year
## <chr> <dbl> <dbl>
## 1 Burundi 238. 2023
## 2 Burundi 240. 2023
## 3 Burundi 242. 2023
## 4 Burundi 244. 2023
## 5 Burundi 245. 2023
## 6 Burundi 246. 2023
## 7 Burundi 246. 2023
## 8 Burundi 314. 2023
## 9 Burundi 247. 2023
## 10 Burundi 246. 2023
## # ℹ 2,877 more rows
# Merge with shapefile
merge_shape <- function(shape, data, rate_col) {
df <- left_join(shape, data, by = "Country")
centroid_coords <- st_centroid(df) %>% st_coordinates() %>% as.data.frame()
df$lon <- centroid_coords$X
df$lat <- centroid_coords$Y
df$Label <- ifelse(is.na(df[[rate_col]]), paste0(df$Country, "\nNA"), df$Country)
return(df)
}
neonatal_map <- merge_shape(eac_shape, neonatal_latest, "NeonatalRate")
## Warning: st_centroid assumes attributes are constant over geometries
under5_map <- merge_shape(eac_shape, under5_latest, "Under5Rate")
## Warning: st_centroid assumes attributes are constant over geometries
#Plotting Choropleth Maps
plot_choropleth <- function(df, rate_col, title) {
ggplot(df) +
geom_sf(aes(fill = .data[[rate_col]]), color = "white") +
geom_text(aes(x = lon, y = lat, label = Label), size = 3, color = "black") +
scale_fill_viridis_c(
name = "Deaths/1,000",
option = "C",
na.value = "lightgrey",
direction = -1,
breaks = pretty(range(df[[rate_col]], na.rm = TRUE), n = 5)
) +
labs(title = title) +
theme_minimal()+
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
legend.position = "right"
)
}
plot_choropleth(neonatal_map, "NeonatalRate", "Latest Neonatal Mortality in EAC")
plot_choropleth(under5_map, "Under5Rate", "Latest Under-Five Mortality in EAC")
### Analyzing Trend Over Time
``` r
#Preparing Trend Data
prepare_trend <- function(df, rate_col) {
df %>%
group_by(Year) %>%
mutate(AvgRate = mean(.data[[rate_col]], na.rm = TRUE)) %>%
ungroup()
}
# Call the function with the correct rate column for each dataset
neonatal_trend <- prepare_trend(neonatal, "Observation Value")
under5_trend <- prepare_trend(under5, "Observation Value")
neonatal_trend
## # A tibble: 622 × 26
## REF_AREA `Geographic area` `Regional group` Indicator Sex `Wealth Quintile`
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 BDI Burundi <NA> Neonatal… Total Total
## 2 BDI Burundi <NA> Neonatal… Total Total
## 3 BDI Burundi <NA> Neonatal… Total Total
## 4 BDI Burundi <NA> Neonatal… Total Total
## 5 BDI Burundi <NA> Neonatal… Total Total
## 6 BDI Burundi <NA> Neonatal… Total Total
## 7 BDI Burundi <NA> Neonatal… Total Total
## 8 BDI Burundi <NA> Neonatal… Total Total
## 9 BDI Burundi <NA> Neonatal… Total Total
## 10 BDI Burundi <NA> Neonatal… Total Total
## # ℹ 612 more rows
## # ℹ 20 more variables: `Series Name` <chr>, `Series Year` <chr>,
## # `Reference Date` <dbl>, `Observation Value` <dbl>, `Lower Bound` <dbl>,
## # `Upper Bound` <dbl>, `Standard Error` <dbl>, `Country notes` <chr>,
## # `Observation Status` <chr>, `Unit of measure` <chr>, `Series Type` <chr>,
## # `Series Category` <chr>, `Series Method` <chr>, `Age Group of Women` <chr>,
## # `Time Since First Birth` <chr>, Definition <lgl>, Interval <dbl>, …
under5_trend
## # A tibble: 5,173 × 26
## REF_AREA `Geographic area` `Regional group` Indicator Sex `Wealth Quintile`
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 BDI Burundi <NA> Under-fi… Fema… Total
## 2 BDI Burundi <NA> Under-fi… Fema… Total
## 3 BDI Burundi <NA> Under-fi… Fema… Total
## 4 BDI Burundi <NA> Under-fi… Fema… Total
## 5 BDI Burundi <NA> Under-fi… Fema… Total
## 6 BDI Burundi <NA> Under-fi… Fema… Total
## 7 BDI Burundi <NA> Under-fi… Fema… Total
## 8 BDI Burundi <NA> Under-fi… Fema… Total
## 9 BDI Burundi <NA> Under-fi… Fema… Total
## 10 BDI Burundi <NA> Under-fi… Fema… Total
## # ℹ 5,163 more rows
## # ℹ 20 more variables: `Series Name` <chr>, `Series Year` <chr>,
## # `Reference Date` <dbl>, `Observation Value` <dbl>, `Lower Bound` <dbl>,
## # `Upper Bound` <dbl>, `Standard Error` <dbl>, `Country notes` <chr>,
## # `Observation Status` <chr>, `Unit of measure` <chr>, `Series Type` <chr>,
## # `Series Category` <chr>, `Series Method` <chr>, `Age Group of Women` <chr>,
## # `Time Since First Birth` <chr>, Definition <lgl>, Interval <dbl>, …
plot_trends <- function(df, rate_col, title) {
ggplot(df, aes(x = Year)) +
geom_line(aes(y = AvgRate), color = "#0072B2", linetype = "dashed", size = 1.2) +
geom_jitter(aes(y = .data[[rate_col]], color = Country), width = 0.3, size = 2, alpha = 0.6) +
scale_color_viridis_d(name = "Country") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
labs(title = title, y = "Deaths/1,000 live births", x = "Year") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 14, face = "bold", hjust = 0.5)
)
}
plot_trends(neonatal_trend, "Observation Value", "Neonatal Mortality Trends in EAC")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot_trends(under5_trend, "Observation Value", "Under-Five Mortality Trends in EAC")
Finding Country with the Highest Mortality Rates in EAC
#Highest Under 5
cat("Country with Highest Under-5 Mortality:\n")
## Country with Highest Under-5 Mortality:
under5_latest %>%
arrange(desc(Under5Rate)) %>%
select(Country, Under5Rate,Year)
## # A tibble: 2,887 × 3
## Country Under5Rate Year
## <chr> <dbl> <dbl>
## 1 South Sudan 773. 2023
## 2 South Sudan 767. 2023
## 3 South Sudan 761. 2023
## 4 Somalia 480. 2023
## 5 Somalia 473. 2023
## 6 Somalia 468. 2023
## 7 Somalia 466. 2023
## 8 Somalia 464. 2023
## 9 Somalia 460. 2023
## 10 Somalia 456. 2023
## # ℹ 2,877 more rows
#Highest Neonatal
cat("\nCountry with Highest Neonatal Mortality:\n")
##
## Country with Highest Neonatal Mortality:
neonatal_latest %>%
arrange(desc(NeonatalRate)) %>%
select(Country, NeonatalRate, Year)
## # A tibble: 425 × 3
## Country NeonatalRate Year
## <chr> <dbl> <dbl>
## 1 South Sudan 65.8 2023
## 2 South Sudan 65.2 2023
## 3 South Sudan 64.8 2023
## 4 South Sudan 64.6 2023
## 5 South Sudan 64.1 2023
## 6 Rwanda 62.8 2023
## 7 Rwanda 62.8 2023
## 8 Rwanda 62.7 2023
## 9 Rwanda 62.5 2023
## 10 South Sudan 62.4 2023
## # ℹ 415 more rows