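# Console output printed when the packages were attached (presumably
# lubridate, given the names listed):
#   The following objects are masked from 'package:base':
#       date, intersect, setdiff, union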
# Load the raw spreadsheet of parliamentarians.
parliamentarians <- read_excel("~/Downloads/Parliamentarians.xlsx")

# Filter by whether the character string "MP" is in the variable
# `Type of Parliamentarian`.
parliamentarians <- parliamentarians |>
  filter(str_detect(`Type of Parliamentarian`, "MP"))

# Create (mutate) variables for start and end date of the parliamentarian.
# str_sub(x, 5, 14) returns the values between positions 5 and 14,
# e.g. str_sub("abcdefghytr56", 3, 10) returns "cdefghyt".
# ymd() coerces dates-as-string to Date format.
# date_ended is left as a string here; it is parsed in a later step.
parliamentarians <- parliamentarians |>
  mutate(
    date_started = ymd(str_sub(`Type of Parliamentarian`, 5, 14)),
    date_ended = str_sub(`Type of Parliamentarian`, 18, 27)
  )

# Clean and extract party and province. These are regular expressions and
# it's fine if it's not clear. Regular expressions extract/replace data based
# on patterns. If you are curious, take a look at this:
# https://jfjelstul.github.io/regular-expressions-tutorial/
#   party:    first run of characters containing no parenthesis, trimmed
#   province: first run of characters containing no carriage return
parliamentarians <- parliamentarians |>
  mutate(
    party = trimws(str_extract(`Political Affiliation`, "[^\\()]+")),
    province = str_extract(`Province/Territory`, "[^\\\r]+")
  )

# Change format of the end date to Date format. quiet = TRUE keeps ymd() from
# reporting strings it cannot parse; those become NA and are handled below.
parliamentarians <- parliamentarians |>
  mutate(date_ended = ymd(date_ended, quiet = TRUE))

# If no end date, Parliamentarian is still in HoC so put today as end date.
# today() already returns a Date, so it needs no further conversion.
# If you are very curious, look up difference between ifelse() and if_else().
parliamentarians <- parliamentarians |>
  mutate(date_ended = if_else(is.na(date_ended), today(), date_ended))

# Keep only those parties, let's not keep the small parties / independents.
names_keep <- c(
  "Conservative",
  "Conservative Party of Canada",
  "Liberal Party of Canada",
  "New Democratic Party",
  "Progressive Conservative Party"
)

# Keep only variables we need.
# Delete rows where missing values.
parliamentarians <- parliamentarians |>
  filter(party %in% names_keep) |>
  select(Name, Gender, party, date_started, date_ended, province) |>
  na.omit()

##### END OF CLEANING

# Make Figure 1: number of parliamentarians by gender.
ggplot(parliamentarians, aes(x = Gender)) +
  geom_bar()
# Save Figure 1
# ggsave("~/Desktop/figure1.png")

# Same chart as Figure 1, but with the counts computed explicitly first:
# one row per Gender with the number of parliamentarians in `count`.
to_plot <- parliamentarians |>
  group_by(Gender) |>
  summarise(count = n())

# geom_col() draws bars whose heights are taken from `y`; it is the
# shorthand for geom_bar(stat = "identity").
ggplot(to_plot, aes(x = Gender, y = count)) +
  geom_col()
# ggsave("~/Desktop/figure3.png")

# Plot and save figure 4: gender counts, one column of panels per party and
# one row of panels per year_period.
# NOTE(review): year_period is not created in the cleaning code shown above;
# it is assumed to be added to `parliamentarians` elsewhere - confirm.
# Colours are named by party so they do not depend on the order of the levels.
ggplot(parliamentarians, aes(x = Gender, fill = party)) +
  geom_bar() +
  facet_grid(cols = vars(party), rows = vars(year_period)) +
  theme(legend.position = "none") +
  scale_fill_manual(
    values = c(
      "Conservative" = "darkblue",
      "Conservative Party of Canada" = "darkblue",
      "Liberal Party of Canada" = "red",
      "New Democratic Party" = "orange",
      "Progressive Conservative Party" = "darkblue"
    )
  )
# ggsave("~/Desktop/figure4.png")

# Here's one way to fix the text that does not all appear on the same line
# for parties: let the facet labeller wrap labels at 10 characters.
# NOTE(review): year_period is assumed to be defined elsewhere - confirm.
# Colours are named by party so they do not depend on the order of the levels.
ggplot(parliamentarians, aes(x = Gender, fill = party)) +
  geom_bar() +
  facet_grid(
    cols = vars(party),
    rows = vars(year_period),
    labeller = label_wrap_gen(width = 10)
  ) +
  theme(legend.position = "none") +
  scale_fill_manual(
    values = c(
      "Conservative" = "darkblue",
      "Conservative Party of Canada" = "darkblue",
      "Liberal Party of Canada" = "red",
      "New Democratic Party" = "orange",
      "Progressive Conservative Party" = "darkblue"
    )
  )
# Here's another way: wrap the party names ourselves with str_wrap() and
# facet on the wrapped copy, so no labeller is needed.
parliamentarians <- parliamentarians |>
  mutate(party_ = str_wrap(party, 10))

# NOTE(review): year_period is assumed to be defined elsewhere - confirm.
# The fill colours are positional: they follow the alphabetical order of the
# five (wrapped) party names.
ggplot(parliamentarians, aes(x = Gender, fill = party_)) +
  geom_bar() +
  facet_grid(cols = vars(party_), rows = vars(year_period)) +
  theme(legend.position = "none") +
  scale_fill_manual(
    values = c("darkblue", "darkblue", "red", "orange", "darkblue")
  )
# Recode party variable: fold the two longer Conservative labels into
# "Conservative", leaving three parties.
parliamentarians <- parliamentarians |>
  mutate(
    party = recode(
      party,
      `Conservative Party of Canada` = "Conservative",
      `Progressive Conservative Party` = "Conservative"
    )
  )

# Same faceted bar chart as before, now with three party columns.
# NOTE(review): year_period is assumed to be defined elsewhere - confirm.
# Colours are named by party so they do not depend on the order of the levels.
ggplot(parliamentarians, aes(x = Gender, fill = party)) +
  geom_bar() +
  facet_grid(cols = vars(party), rows = vars(year_period)) +
  theme(legend.position = "none") +
  scale_fill_manual(
    values = c(
      "Conservative" = "darkblue",
      "Liberal Party of Canada" = "red",
      "New Democratic Party" = "orange"
    )
  )
# Density of year_in_office across all remaining parliamentarians.
# One row is dropped by name before plotting (presumably an outlier or a data
# problem - confirm). year_in_office is assumed to be defined elsewhere.
parliamentarians |>
  filter(Name != "Brecken, Frederick de St Croix") |>
  ggplot(mapping = aes(x = year_in_office)) +
  geom_density()
# Density of year_in_office, one overlaid curve per Gender. alpha = 0.6 makes
# the fills semi-transparent so overlapping curves stay visible.
# The same named row is dropped as in the other year_in_office plots.
parliamentarians |>
  filter(Name != "Brecken, Frederick de St Croix") |>
  ggplot(mapping = aes(x = year_in_office, fill = Gender)) +
  geom_density(alpha = 0.6)
# Density of year_in_office in a grid of panels: one column per party, one
# row per Gender. The three fill colours are positional, so this expects the
# party variable to have been recoded down to three values beforehand.
parliamentarians |>
  filter(Name != "Brecken, Frederick de St Croix") |>
  ggplot(mapping = aes(x = year_in_office, fill = party)) +
  geom_density() +
  facet_grid(rows = vars(Gender), cols = vars(party)) +
  scale_fill_manual(values = c("darkblue", "red", "orange"))
# Histogram version of the party-by-Gender grid of year_in_office.
# No bin settings are given, so stat_bin() uses its default number of bins
# and prints a message saying so.
parliamentarians |>
  filter(Name != "Brecken, Frederick de St Croix") |>
  ggplot(mapping = aes(x = year_in_office, fill = party)) +
  geom_histogram() +
  facet_grid(rows = vars(Gender), cols = vars(party)) +
  scale_fill_manual(values = c("darkblue", "red", "orange"))
# Message printed by the histogram call above:
#   `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.