Homework 1 code.qmd

library(readxl)
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.4.2     ✔ purrr   1.0.1
✔ tibble  3.2.1     ✔ dplyr   1.1.2
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.2     ✔ forcats 0.5.2

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

library(lubridate)


Attaching package: 'lubridate'

The following objects are masked from 'package:base':

    date, intersect, setdiff, union

# Load the raw spreadsheet and keep only the rows whose
# `Type of Parliamentarian` entry contains the character string "MP".
parliamentarians <- read_excel("~/Downloads/Parliamentarians.xlsx") |>
  filter(str_detect(`Type of Parliamentarian`, "MP"))

# Add variables for the start and end date of each parliamentarian.
# str_sub(x, 5, 14) returns the characters at positions 5 through 14,
# e.g. str_sub("abcdefghytr56", 3, 10) returns "cdefghyt".
# ymd() converts a date stored as a string into a Date; date_ended is left
# as a string at this point.
parliamentarians <- parliamentarians |>
  mutate(
    date_started = ymd(str_sub(`Type of Parliamentarian`, 5, 14)),
    date_ended = str_sub(`Type of Parliamentarian`, 18, 27)
  )

# Clean and extract party and province with regular expressions, which
# extract or replace text based on patterns. It is fine if the patterns are
# not clear yet. If you are curious, take a look at this:
# https://jfjelstul.github.io/regular-expressions-tutorial/
# party: the first run of characters containing no parentheses, with the
# surrounding whitespace trimmed.
# province: the first run of characters containing no carriage return.
parliamentarians <- parliamentarians |>
  mutate(
    party = `Political Affiliation` |>
      str_extract("[^\\()]+") |>
      trimws(),
    province = str_extract(`Province/Territory`, "[^\\\r]+")
  )

# Parse date_ended into a Date; entries that cannot be parsed become NA and
# quiet = TRUE keeps ymd() silent about them.
# A missing end date is then replaced with today's date, on the assumption
# that the parliamentarian is still in the House of Commons.
# If you are very curious, look up the difference between ifelse() and if_else()
parliamentarians <- parliamentarians |>
  mutate(
    date_ended = ymd(date_ended, quiet = TRUE),
    date_ended = if_else(is.na(date_ended), ymd(today()), date_ended)
  )

# Parties to keep; the small parties and independents are left out
names_keep <- c(
  "Conservative",
  "Conservative Party of Canada",
  "Liberal Party of Canada",
  "New Democratic Party",
  "Progressive Conservative Party"
)
# Keep only the variables we need, restrict the rows to the parties above,
# and delete every row that has a missing value
parliamentarians <- parliamentarians |>
  select(Name, Gender, party, date_started, date_ended, province) |>
  filter(party %in% names_keep) |>
  na.omit()

##### END OF CLEANING

# Figure 1: bar chart of the number of rows for each value of Gender
parliamentarians |>
  ggplot(aes(x = Gender)) +
  geom_bar()

# Save Figure 1
#ggsave("~/Desktop/figure1.png")

# Figure 2: the same kind of bar chart, built from counts computed beforehand
to_plot <- parliamentarians |>
  group_by(Gender) |>
  summarise(count = n())

# geom_col() draws bars whose heights are the y values as given
to_plot |>
  ggplot(aes(x = Gender, y = count)) +
  geom_col()

#ggsave("~/Desktop/figure2.png",width=8,height=6)

# Add the year in which each parliamentarian started and the period that
# year falls into; dig.lab = 4 formats the break years in the interval
# labels with four digits.
parliamentarians <- parliamentarians |>
  mutate(year = year(date_started)) |>
  mutate(
    year_period = cut(
      year,
      breaks = c(1860, 1900, 1950, 2000, 2025),
      dig.lab = 4
    )
  )
# Figure 3: counts by Gender, one panel per period, y axis fixed to 0-1300
parliamentarians |>
  ggplot(aes(x = Gender)) +
  geom_bar() +
  facet_wrap(vars(year_period)) +
  scale_y_continuous(limits = c(0, 1300))

#ggsave("~/Desktop/figure3.png")

# Plot and save figure 4: counts by Gender in a grid with one column per
# party and one row per period. The legend is hidden because the facet
# columns already name the parties. The unnamed fill colours are matched to
# the parties in order (usually alphabetical).
parliamentarians |>
  ggplot(aes(x = Gender, fill = party)) +
  geom_bar() +
  facet_grid(rows = vars(year_period), cols = vars(party)) +
  scale_fill_manual(
    values = c("darkblue", "darkblue", "red", "orange", "darkblue")
  ) +
  theme(legend.position = "none")

#ggsave("~/Desktop/figure4.png")

# Here's one way to fix party names that do not fit on one line in the facet
# labels: let the labeller wrap the label text at a width of 10 characters

parliamentarians |>
  ggplot(aes(x = Gender, fill = party)) +
  geom_bar() +
  facet_grid(
    rows = vars(year_period),
    cols = vars(party),
    labeller = label_wrap_gen(width = 10)
  ) +
  scale_fill_manual(
    values = c("darkblue", "darkblue", "red", "orange", "darkblue")
  ) +
  theme(legend.position = "none")

# Here's another way: wrap the party names in the data itself and facet on
# the wrapped variable, so that no labeller is needed.
# str_wrap(party, 10) inserts line breaks so that the text is wrapped at a
# width of about 10 characters.

parliamentarians <- parliamentarians |>
  mutate(party_ = str_wrap(party, 10))

# Facet on party_ (not party): the facet labels then show the wrapped text.
ggplot(parliamentarians, aes(x = Gender, fill = party_)) +
  geom_bar() +
  facet_grid(cols = vars(party_), rows = vars(year_period)) +
  theme(legend.position = "none") +
  scale_fill_manual(
    values = c("darkblue", "darkblue", "red", "orange", "darkblue")
  )

# Recode the party variable: the two longer Conservative party names are
# both relabelled "Conservative"; every other value is left unchanged
parliamentarians <- parliamentarians |>
  mutate(
    party = recode(
      party,
      "Conservative Party of Canada" = "Conservative",
      "Progressive Conservative Party" = "Conservative"
    )
  )

# Figure 5: counts by Gender in a party-by-period grid, three fill colours
parliamentarians |>
  ggplot(aes(x = Gender, fill = party)) +
  geom_bar() +
  facet_grid(rows = vars(year_period), cols = vars(party)) +
  scale_fill_manual(values = c("darkblue", "red", "orange")) +
  theme(legend.position = "none")

#ggsave("~/Desktop/figure5.png")

# Compute the number of rows for every Gender / party / period combination,
# then plot those counts directly as bar heights
to_plot <- parliamentarians |>
  group_by(Gender, party, year_period) |>
  count()

to_plot |>
  ggplot(aes(x = Gender, y = n, fill = party)) +
  geom_col() +
  facet_grid(rows = vars(year_period), cols = vars(party)) +
  scale_fill_manual(values = c("darkblue", "red", "orange")) +
  theme(legend.position = "none")

# Years in office, computed as the calendar year of the end date minus the
# calendar year of the start date; then drop the rows whose start year falls
# in the (2000,2025] period.
parliamentarians <- parliamentarians |>
  mutate(year_in_office = year(date_ended) - year(date_started)) |>
  filter(year_period != "(2000,2025]")

# Density of the number of years in office
parliamentarians |>
  ggplot(aes(x = year_in_office)) +
  geom_density()

# Same density, leaving out the rows named "Brecken, Frederick de St Croix"
parliamentarians |>
  filter(Name != "Brecken, Frederick de St Croix") |>
  ggplot(aes(x = year_in_office)) +
  geom_density()

# One density per Gender on the same axes; alpha = 0.6 makes the fills
# semi-transparent so overlapping curves stay visible
parliamentarians |>
  filter(Name != "Brecken, Frederick de St Croix") |>
  ggplot(aes(x = year_in_office, fill = Gender)) +
  geom_density(alpha = 0.6)

# Densities of years in office in a grid: one column per party, one row per
# Gender, filled with one colour per party
parliamentarians |>
  filter(Name != "Brecken, Frederick de St Croix") |>
  ggplot(aes(x = year_in_office, fill = party)) +
  geom_density() +
  facet_grid(rows = vars(Gender), cols = vars(party)) +
  scale_fill_manual(values = c("darkblue", "red", "orange"))

# Histograms of years in office in a grid: one column per party, one row per
# Gender, filled with one colour per party.
# year_in_office is a difference of two calendar years, so it only takes
# whole-number values; binwidth = 1 gives every value its own bin. Relying on
# the default of 30 bins makes ggplot2 print a "Pick better value with
# `binwidth`" message and spreads the whole-number values unevenly over bins.
ggplot(
  data = parliamentarians |> filter(Name != "Brecken, Frederick de St Croix"),
  aes(x = year_in_office, fill = party)
) +
  geom_histogram(binwidth = 1) +
  facet_grid(cols = vars(party), rows = vars(Gender)) +
  scale_fill_manual(values = c("darkblue", "red", "orange"))

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.