library(tidyverse)
library(rvest)
library(ggridges)
library(rstan)
# Load the webpage
<- "https://reseausportsadultes.com/quebec_levis/hockey/horaire/ligue/103/a24-h25?date=saison&segment=saison_reg&equipe="
url <- read_html(url)
page
# Extract team names (assuming they are under a tag with class 'b')
<- page %>% html_nodes("b") %>% html_text()
teams <- teams[-1]
teams <- trimws(str_replace(teams[1:(59*2)],"\n",""))[seq(1, 118, by = 2)]
t1 <- trimws(str_replace(teams[1:(59*2)],"\n",""))[seq(2, 118, by = 2)]
t2
# Extract scores (assuming they are under a tag with class 'scores')
<- page %>% html_nodes(".scores") %>% html_text()
scores <- trimws(str_replace(scores[1:59],"\n",""))
scores
<- sub(" .*", "", scores)
s1 <- sub(".* ", "", scores)
s2
<- data.frame(t1,t2,s1=as.numeric(s1),s2=as.numeric(s2))
data <- data %>%
data filter(str_detect("Les 2 frères du quartier|Location Beauport|Les Styfes Sud|Hawks|Los Bambinos|Les Grues JLR|Pens|Beerkings|Matract|Blue Kings",t1))
# Calculate score difference and create team indices
<- unique(c(data$t1, data$t2)) # Get unique team names
teams <- setNames(seq_along(teams), teams) # Create index for each team
team_indices
# Map team names to indices in data
<- data %>%
data mutate(score_diff = s1 - s2,
team1 = team_indices[t1],
team2 = team_indices[t2])
# Stan model as a string
<- "
stan_model_code data {
int<lower=0> N; // number of games
int<lower=1> T; // number of teams
int<lower=1, upper=T> team1[N]; // team 1 index
int<lower=1, upper=T> team2[N]; // team 2 index
vector[N] score_diff; // score difference for each game
}
parameters {
vector[T] ability; // ability for each team
real<lower=0> sigma; // standard deviation
}
model {
ability ~ normal(0, 2); // Prior on ability
score_diff ~ normal(ability[team1] - ability[team2], sigma);
}
"
# Compile the model
<- stan_model(model_code = stan_model_code)
stan_model
# Data list for Stan
<- list(
stan_data N = nrow(data),
T = length(unique(c(data$team1, data$team2))),
team1 = data$team1,
team2 = data$team2,
score_diff = data$score_diff
)
# Fit the model
<- sampling(stan_model, data = stan_data, iter = 2000, chains = 4) fit
Modéliser la différence de score entre deux équipes à chaque match comme étant normalement distribuée, la moyenne étant donnée par la différence de leurs habilités Les capacités des équipes sont également modélisées comme étant normalement distribuées avec un a priori :
\[ \text{ability}_j \sim \mathcal{N}(0, 2), \quad \text{for } j = 1, \dots, T \]
\[ \text{score\_diff}_i \sim \mathcal{N}(\text{ability}_{\text{team1}_i} - \text{ability}_{\text{team2}_i}, \sigma), \quad \text{for } i = 1, \dots, N \]
fit
Inference for Stan model: anon_model.
4 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=4000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
ability[1] 0.59 0.02 1.11 -1.67 -0.12 0.58 1.32 2.76 4010 1
ability[2] -4.89 0.02 1.20 -7.19 -5.69 -4.91 -4.09 -2.45 3878 1
ability[3] 1.16 0.02 1.15 -1.09 0.37 1.18 1.94 3.41 4085 1
ability[4] 1.87 0.02 1.13 -0.42 1.16 1.89 2.61 4.06 3780 1
ability[5] 1.63 0.02 1.15 -0.70 0.86 1.65 2.39 3.82 4149 1
ability[6] 0.49 0.02 1.12 -1.67 -0.24 0.48 1.23 2.70 3633 1
ability[7] 2.76 0.02 1.14 0.40 2.00 2.80 3.52 4.97 3616 1
ability[8] -1.65 0.02 1.13 -3.87 -2.41 -1.65 -0.88 0.53 4031 1
ability[9] 0.40 0.02 1.16 -1.86 -0.38 0.42 1.20 2.67 3990 1
ability[10] -2.22 0.02 1.15 -4.49 -3.03 -2.21 -1.42 0.02 3870 1
sigma 3.11 0.01 0.48 2.34 2.79 3.05 3.39 4.20 3310 1
lp__ -62.89 0.07 2.60 -68.93 -64.43 -62.49 -61.00 -58.97 1291 1
Samples were drawn using NUTS(diag_e) at Tue Nov 12 17:42:51 2024.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
<- tibble(as.data.frame(extract(fit)$ability))
abilities names(abilities) <- names(team_indices)
<- extract(fit)$sigma
sigma
%>%
abilities gather(team,ability) %>%
ggplot(aes(y = reorder(team,ability), x = ability)) +
geom_density_ridges() + labs(x='Habilité',y='Équipe')
On sait qu’on prédit le score final par:
\[ \text{score\_diff}_i \sim \mathcal{N}(\text{ability}_{\text{team1}_i} - \text{ability}_{\text{team2}_i}, \sigma), \quad \text{for } i = 1, \dots, N \]
Et on peut visualiser le différentiel de scores possible probabilistiquement si les Grues (mon équipe) joue contre chacune des 9 autre équipes:
<- abilities$`Les Grues JLR`-abilities
ev <- sapply(1:10,function(y){
simulated_games sapply(1:4000,function(x){
rnorm(1,ev[,y][x],sigma[x])})
})
<- tibble(as.data.frame(simulated_games))
simulated_games names(simulated_games) <- names(team_indices)
%>%
simulated_games gather(team,ability) %>%
filter(team!="Les Grues JLR") %>%
ggplot(aes(y = reorder(team,ability), x = ability)) +
geom_density_ridges() + labs(x="Prédiction d'un match contre cette équipe",y='Équipe') +
coord_cartesian(xlim = c(-10,10)) +
geom_vline(aes(xintercept=0))