Loading Libraries

library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(viridis)
## Warning: package 'viridis' was built under R version 4.4.3
## Loading required package: viridisLite

Load and Prepare Data

# Load WHO child mortality dataset (assumes combined data file)
data <- read_csv("dataset_datascience.csv")
## Rows: 129564 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): REF_AREA, Geographic area, Regional group, Indicator, Sex, Wealth ...
## dbl  (6): Reference Date, Observation Value, Lower Bound, Upper Bound, Stand...
## lgl  (1): Definition
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View column names to verify structure
colnames(data)
##  [1] "REF_AREA"               "Geographic area"        "Regional group"        
##  [4] "Indicator"              "Sex"                    "Wealth Quintile"       
##  [7] "Series Name"            "Series Year"            "Reference Date"        
## [10] "Observation Value"      "Lower Bound"            "Upper Bound"           
## [13] "Standard Error"         "Country notes"          "Observation Status"    
## [16] "Unit of measure"        "Series Type"            "Series Category"       
## [19] "Series Method"          "Age Group of Women"     "Time Since First Birth"
## [22] "Definition"             "Interval"
# Load shapefile for mapping, standardize names
eac_shape <- st_read("EAC_COUNTRIES.shp") %>%
  mutate(Country = case_when(
    NAME == "Democratic Republic of the Congo" ~ "Congo DRC",
    NAME == "United Republic of Tanzania" ~ "Tanzania",
    TRUE ~ NAME
  ))
## Reading layer `EAC_COUNTRIES' from data source 
##   `D:\Data Analysis Projects\CEMA DATA SCIENCE\EAC_COUNTRIES.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 8 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 1359718 ymin: -1512113 xmax: 5723082 ymax: 1357225
## Projected CRS: WGS 84 / Pseudo-Mercator
print(unique(eac_shape$Country))
## [1] "Burundi"     "Congo DRC"   "Kenya"       "Rwanda"      "Tanzania"   
## [6] "South Sudan" "Uganda"      "Somalia"
# Define EAC countries
eac_countries <- c("Burundi", "Kenya", "Rwanda", "South Sudan", 
                   "Tanzania", "Uganda", "Congo DRC", "Somalia")

# Standardize and filter for EAC countries only
mortality_data <- data %>%
  mutate(
    Country = case_when(
      `Geographic area` %in% c("Congo, Dem. Rep.", "Democratic Republic of the Congo") ~ "Congo DRC",
      `Geographic area` %in% c("Tanzania, United Rep.", "United Republic of Tanzania") ~ "Tanzania",
      TRUE ~ `Geographic area`
    ),
    Year = as.numeric(substr(`Series Year`, 1, 4))
  ) %>%
  filter(Country %in% eac_countries)
mortality_data
## # A tibble: 5,795 × 25
##    REF_AREA `Geographic area` `Regional group` Indicator Sex   `Wealth Quintile`
##    <chr>    <chr>             <chr>            <chr>     <chr> <chr>            
##  1 BDI      Burundi           <NA>             Neonatal… Total Total            
##  2 BDI      Burundi           <NA>             Neonatal… Total Total            
##  3 BDI      Burundi           <NA>             Neonatal… Total Total            
##  4 BDI      Burundi           <NA>             Neonatal… Total Total            
##  5 BDI      Burundi           <NA>             Neonatal… Total Total            
##  6 BDI      Burundi           <NA>             Neonatal… Total Total            
##  7 BDI      Burundi           <NA>             Neonatal… Total Total            
##  8 BDI      Burundi           <NA>             Neonatal… Total Total            
##  9 BDI      Burundi           <NA>             Neonatal… Total Total            
## 10 BDI      Burundi           <NA>             Neonatal… Total Total            
## # ℹ 5,785 more rows
## # ℹ 19 more variables: `Series Name` <chr>, `Series Year` <chr>,
## #   `Reference Date` <dbl>, `Observation Value` <dbl>, `Lower Bound` <dbl>,
## #   `Upper Bound` <dbl>, `Standard Error` <dbl>, `Country notes` <chr>,
## #   `Observation Status` <chr>, `Unit of measure` <chr>, `Series Type` <chr>,
## #   `Series Category` <chr>, `Series Method` <chr>, `Age Group of Women` <chr>,
## #   `Time Since First Birth` <chr>, Definition <lgl>, Interval <dbl>, …
unique(data$Indicator)
## [1] "Neonatal mortality rate"   "Under-five mortality rate"
# Separate neonatal and under-5 mortality
neonatal <- mortality_data %>%
  filter(Indicator == "Neonatal mortality rate")

under5 <- mortality_data %>%
  filter(Indicator == "Under-five mortality rate")


summary(neonatal)
##    REF_AREA         Geographic area    Regional group      Indicator        
##  Length:622         Length:622         Length:622         Length:622        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      Sex            Wealth Quintile    Series Name        Series Year       
##  Length:622         Length:622         Length:622         Length:622        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Reference Date Observation Value  Lower Bound     Upper Bound    
##  Min.   :1954   Min.   :11.10     Min.   :11.07   Min.   : 23.29  
##  1st Qu.:1984   1st Qu.:28.04     1st Qu.:22.57   1st Qu.: 32.79  
##  Median :1996   Median :38.10     Median :32.35   Median : 45.69  
##  Mean   :1995   Mean   :38.00     Mean   :31.52   Mean   : 48.32  
##  3rd Qu.:2008   3rd Qu.:45.62     3rd Qu.:38.70   3rd Qu.: 60.65  
##  Max.   :2024   Max.   :70.16     Max.   :56.20   Max.   :113.45  
##                                   NA's   :197     NA's   :197     
##  Standard Error   Country notes      Observation Status Unit of measure   
##  Min.   : 1.472   Length:622         Length:622         Length:622        
##  1st Qu.: 3.058   Class :character   Class :character   Class :character  
##  Median : 4.310   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 5.238                                                           
##  3rd Qu.: 6.186                                                           
##  Max.   :34.290                                                           
##  NA's   :427                                                              
##  Series Type        Series Category    Series Method      Age Group of Women
##  Length:622         Length:622         Length:622         Length:622        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Time Since First Birth Definition        Interval       Country         
##  Length:622             Mode:logical   Min.   :1.000   Length:622        
##  Class :character       NA's:622       1st Qu.:1.000   Class :character  
##  Mode  :character                      Median :1.000   Mode  :character  
##                                        Mean   :2.258                     
##                                        3rd Qu.:5.000                     
##                                        Max.   :5.000                     
##                                        NA's   :1                         
##       Year     
##  Min.   :1977  
##  1st Qu.:2014  
##  Median :2023  
##  Mean   :2017  
##  3rd Qu.:2023  
##  Max.   :2023  
## 
summary(under5)
##    REF_AREA         Geographic area    Regional group      Indicator        
##  Length:5173        Length:5173        Length:5173        Length:5173       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      Sex            Wealth Quintile    Series Name        Series Year       
##  Length:5173        Length:5173        Length:5173        Length:5173       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Reference Date Observation Value  Lower Bound      Upper Bound    
##  Min.   :1952   Min.   : -2.835   Min.   : 17.65   Min.   : 38.94  
##  1st Qu.:1988   1st Qu.: 93.067   1st Qu.: 63.34   1st Qu.: 99.99  
##  Median :1998   Median :144.200   Median :122.20   Median :174.22  
##  Mean   :1997   Mean   :146.088   Mean   :123.83   Mean   :183.20  
##  3rd Qu.:2008   3rd Qu.:186.396   3rd Qu.:169.80   3rd Qu.:235.75  
##  Max.   :2024   Max.   :773.174   Max.   :634.89   Max.   :929.16  
##                                   NA's   :2286     NA's   :2286    
##  Standard Error    Country notes      Observation Status Unit of measure   
##  Min.   :  0.149   Length:5173        Length:5173        Length:5173       
##  1st Qu.:  8.700   Class :character   Class :character   Class :character  
##  Median : 11.200   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 12.768                                                           
##  3rd Qu.: 14.800                                                           
##  Max.   :180.790                                                           
##  NA's   :3461                                                              
##  Series Type        Series Category    Series Method      Age Group of Women
##  Length:5173        Length:5173        Length:5173        Length:5173       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Time Since First Birth Definition        Interval       Country         
##  Length:5173            Mode:logical   Min.   :1.000   Length:5173       
##  Class :character       NA's:5173      1st Qu.:1.000   Class :character  
##  Mode  :character                      Median :1.000   Mode  :character  
##                                        Mean   :1.697                     
##                                        3rd Qu.:1.000                     
##                                        Max.   :5.000                     
##                                        NA's   :768                       
##       Year     
##  Min.   :1955  
##  1st Qu.:2007  
##  Median :2023  
##  Mean   :2014  
##  3rd Qu.:2023  
##  Max.   :2023  
## 

Merge with Latest Data and Add Labels

#Latest data
get_latest <- function(df) {
  df %>%
    group_by(Country) %>%
    filter(Year == max(Year, na.rm = TRUE)) %>%
    ungroup() %>%
    select(Country, `Observation Value`, Year)
}

neonatal_latest <- get_latest(neonatal) %>% rename(NeonatalRate = `Observation Value`)
under5_latest <- get_latest(under5) %>% rename(Under5Rate = `Observation Value`)
neonatal_latest
## # A tibble: 425 × 3
##    Country NeonatalRate  Year
##    <chr>          <dbl> <dbl>
##  1 Burundi         49.7  2023
##  2 Burundi         49.9  2023
##  3 Burundi         50.0  2023
##  4 Burundi         49.9  2023
##  5 Burundi         49.9  2023
##  6 Burundi         50.0  2023
##  7 Burundi         49.9  2023
##  8 Burundi         49.7  2023
##  9 Burundi         50.7  2023
## 10 Burundi         49.3  2023
## # ℹ 415 more rows
under5_latest
## # A tibble: 2,887 × 3
##    Country Under5Rate  Year
##    <chr>        <dbl> <dbl>
##  1 Burundi       238.  2023
##  2 Burundi       240.  2023
##  3 Burundi       242.  2023
##  4 Burundi       244.  2023
##  5 Burundi       245.  2023
##  6 Burundi       246.  2023
##  7 Burundi       246.  2023
##  8 Burundi       314.  2023
##  9 Burundi       247.  2023
## 10 Burundi       246.  2023
## # ℹ 2,877 more rows
# Merge with shapefile
merge_shape <- function(shape, data, rate_col) {
  df <- left_join(shape, data, by = "Country")
  centroid_coords <- st_centroid(df) %>% st_coordinates() %>% as.data.frame()
  df$lon <- centroid_coords$X
  df$lat <- centroid_coords$Y
  df$Label <- ifelse(is.na(df[[rate_col]]), paste0(df$Country, "\nNA"), df$Country)
  return(df)
}

neonatal_map <- merge_shape(eac_shape, neonatal_latest, "NeonatalRate")
## Warning: st_centroid assumes attributes are constant over geometries
under5_map <- merge_shape(eac_shape, under5_latest, "Under5Rate")
## Warning: st_centroid assumes attributes are constant over geometries

#Plotting Choropleth Maps

plot_choropleth <- function(df, rate_col, title) {
  ggplot(df) +
    geom_sf(aes(fill = .data[[rate_col]]), color = "white") +
    geom_text(aes(x = lon, y = lat, label = Label), size = 3, color = "black") +
    scale_fill_viridis_c(
      name = "Deaths/1,000",
      option = "C",
      na.value = "lightgrey",
      direction = -1,  
  breaks = pretty(range(df[[rate_col]], na.rm = TRUE), n = 5)
    ) +
    labs(title = title) +
    theme_minimal()+
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
      legend.position = "right"
    )
}

plot_choropleth(neonatal_map, "NeonatalRate", "Latest Neonatal Mortality in EAC")

plot_choropleth(under5_map, "Under5Rate", "Latest Under-Five Mortality in EAC")




### Analyzing Trend Over Time


``` r
#Preparing Trend Data
prepare_trend <- function(df, rate_col) {
  df %>%
    group_by(Year) %>%
    mutate(AvgRate = mean(.data[[rate_col]], na.rm = TRUE)) %>%
    ungroup()
}
# Call the function with the correct rate column for each dataset
neonatal_trend <- prepare_trend(neonatal, "Observation Value")
under5_trend   <- prepare_trend(under5, "Observation Value")

neonatal_trend
## # A tibble: 622 × 26
##    REF_AREA `Geographic area` `Regional group` Indicator Sex   `Wealth Quintile`
##    <chr>    <chr>             <chr>            <chr>     <chr> <chr>            
##  1 BDI      Burundi           <NA>             Neonatal… Total Total            
##  2 BDI      Burundi           <NA>             Neonatal… Total Total            
##  3 BDI      Burundi           <NA>             Neonatal… Total Total            
##  4 BDI      Burundi           <NA>             Neonatal… Total Total            
##  5 BDI      Burundi           <NA>             Neonatal… Total Total            
##  6 BDI      Burundi           <NA>             Neonatal… Total Total            
##  7 BDI      Burundi           <NA>             Neonatal… Total Total            
##  8 BDI      Burundi           <NA>             Neonatal… Total Total            
##  9 BDI      Burundi           <NA>             Neonatal… Total Total            
## 10 BDI      Burundi           <NA>             Neonatal… Total Total            
## # ℹ 612 more rows
## # ℹ 20 more variables: `Series Name` <chr>, `Series Year` <chr>,
## #   `Reference Date` <dbl>, `Observation Value` <dbl>, `Lower Bound` <dbl>,
## #   `Upper Bound` <dbl>, `Standard Error` <dbl>, `Country notes` <chr>,
## #   `Observation Status` <chr>, `Unit of measure` <chr>, `Series Type` <chr>,
## #   `Series Category` <chr>, `Series Method` <chr>, `Age Group of Women` <chr>,
## #   `Time Since First Birth` <chr>, Definition <lgl>, Interval <dbl>, …
under5_trend
## # A tibble: 5,173 × 26
##    REF_AREA `Geographic area` `Regional group` Indicator Sex   `Wealth Quintile`
##    <chr>    <chr>             <chr>            <chr>     <chr> <chr>            
##  1 BDI      Burundi           <NA>             Under-fi… Fema… Total            
##  2 BDI      Burundi           <NA>             Under-fi… Fema… Total            
##  3 BDI      Burundi           <NA>             Under-fi… Fema… Total            
##  4 BDI      Burundi           <NA>             Under-fi… Fema… Total            
##  5 BDI      Burundi           <NA>             Under-fi… Fema… Total            
##  6 BDI      Burundi           <NA>             Under-fi… Fema… Total            
##  7 BDI      Burundi           <NA>             Under-fi… Fema… Total            
##  8 BDI      Burundi           <NA>             Under-fi… Fema… Total            
##  9 BDI      Burundi           <NA>             Under-fi… Fema… Total            
## 10 BDI      Burundi           <NA>             Under-fi… Fema… Total            
## # ℹ 5,163 more rows
## # ℹ 20 more variables: `Series Name` <chr>, `Series Year` <chr>,
## #   `Reference Date` <dbl>, `Observation Value` <dbl>, `Lower Bound` <dbl>,
## #   `Upper Bound` <dbl>, `Standard Error` <dbl>, `Country notes` <chr>,
## #   `Observation Status` <chr>, `Unit of measure` <chr>, `Series Type` <chr>,
## #   `Series Category` <chr>, `Series Method` <chr>, `Age Group of Women` <chr>,
## #   `Time Since First Birth` <chr>, Definition <lgl>, Interval <dbl>, …
plot_trends <- function(df, rate_col, title) {
  ggplot(df, aes(x = Year)) +
    geom_line(aes(y = AvgRate), color = "#0072B2", linetype = "dashed", size = 1.2) +
    geom_jitter(aes(y = .data[[rate_col]], color = Country), width = 0.3, size = 2, alpha = 0.6) +
    scale_color_viridis_d(name = "Country") +
    scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
    labs(title = title, y = "Deaths/1,000 live births", x = "Year") +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      plot.title = element_text(size = 14, face = "bold", hjust = 0.5)
    )
}
plot_trends(neonatal_trend, "Observation Value", "Neonatal Mortality Trends in EAC")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

plot_trends(under5_trend, "Observation Value", "Under-Five Mortality Trends in EAC")

Finding Country with the Highest Mortality Rates in EAC

#Highest Under 5
cat("Country with Highest Under-5 Mortality:\n")
## Country with Highest Under-5 Mortality:
under5_latest %>% 
  arrange(desc(Under5Rate)) %>% 
  select(Country, Under5Rate,Year)
## # A tibble: 2,887 × 3
##    Country     Under5Rate  Year
##    <chr>            <dbl> <dbl>
##  1 South Sudan       773.  2023
##  2 South Sudan       767.  2023
##  3 South Sudan       761.  2023
##  4 Somalia           480.  2023
##  5 Somalia           473.  2023
##  6 Somalia           468.  2023
##  7 Somalia           466.  2023
##  8 Somalia           464.  2023
##  9 Somalia           460.  2023
## 10 Somalia           456.  2023
## # ℹ 2,877 more rows
#Highest Neonatal
cat("\nCountry with Highest Neonatal Mortality:\n")
## 
## Country with Highest Neonatal Mortality:
neonatal_latest %>% 
  arrange(desc(NeonatalRate)) %>% 
  select(Country, NeonatalRate, Year)
## # A tibble: 425 × 3
##    Country     NeonatalRate  Year
##    <chr>              <dbl> <dbl>
##  1 South Sudan         65.8  2023
##  2 South Sudan         65.2  2023
##  3 South Sudan         64.8  2023
##  4 South Sudan         64.6  2023
##  5 South Sudan         64.1  2023
##  6 Rwanda              62.8  2023
##  7 Rwanda              62.8  2023
##  8 Rwanda              62.7  2023
##  9 Rwanda              62.5  2023
## 10 South Sudan         62.4  2023
## # ℹ 415 more rows