Using grid search to characterize ATS and total variation by NFL team.
cstm Chuckling silently to myself. Used similarly to lol, but when you’re not actually laughing out loud. - Urban Dictionary
Each NFL game has a spread and point total. Over the course of a year, is a team an ‘over’ team, or an under team? Is there any correlation between a team’s over/under and ATS results?
This is the kind of thing I heard someone describing on a recent commute. I wanted to work a grid search example to estimate something without a closed form solution, so I came up with this as a toy problem.
Objective: For games played in week 1 thru 17, find the point which minimizes the total euclidean distance between a team’s ATS margin of victory and the over/under (i,e,. ‘total margin’).
I’ll pull in the 2024 Pittsburgh Steelers data and plot the ATS and total margin for each of the 16 games played as of week 17.
# Get 2024 Data
df <- nflfastR::load_pbp(2024) %>%
filter(
season_type == "REG" &
(home_team == "PIT" |
away_team == "PIT")
) %>%
# Keep only relevant columns
select(
game_id, season, week,
home_team, away_team,
home_score, away_score,
result, total,
spread_line, total_line
) %>%
# Final results are include on each row of each game.
# For each game, keep only a single instance.
group_by(game_id) %>%
slice_head(n = 1) %>%
ungroup() %>%
# Calculate margins
mutate(
total_margin = total - total_line,
home_ats_margin = result - spread_line,
away_ats_margin = -1L * home_ats_margin
)
pit_df <-
# Make a complete PIT record by combining PIT home games
# with PIT away games.
bind_rows(
df %>%
filter(
home_team == "PIT"
) %>%
mutate(
team = "PIT",
pit_ats_margin =
home_ats_margin,
opponent = away_team
) %>%
select(
week, team, opponent,
pit_ats_margin, total_margin
),
df %>%
filter(
away_team == "PIT"
) %>%
mutate(
team = "PIT",
pit_ats_margin =
away_ats_margin,
opponent = home_team
) %>%
select(
week, team, opponent,
pit_ats_margin, total_margin
)
) %>%
arrange(week)
pit_df %>%
ggplot(
aes(
x = pit_ats_margin,
y = total_margin
)
) +
geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +
geom_nfl_logos(
aes(
team_abbr = opponent
),
width = 0.05
) +
scale_x_continuous(
breaks = seq(-60, 60, by = 6),
minor_breaks = NULL
) +
scale_y_continuous(
breaks = seq(-60, 60, by = 6),
minor_breaks = NULL
) +
coord_fixed() +
labs(
x = "Steelers ATS Margin (pts)",
y = "Game Total Margin (pts)",
title = "2024 Steelers (Thru Week 17)",
subtitle = "Logo = Opponent.",
caption = "Data: nflfastR:load_pbp(2024)"
) +
theme_light()
Next, I’ll make a grid of candidate solutions to check. For each point, I’ll calculate the total euclidean distance between the candidate point and all 16 game results.
Then, I’ll come to a point estimate by averaging over the the 10 points with the best performance (i.e., least total euclidean distance).
I’ll use a contour plot to see patterns in total euclidean distance across the candidate grid to see if this is sensible.
# Make a grid of values to search over.
pit_candidate_grid <-
crossing(
pit_ats_margin = seq(-14, 14, by = 0.5),
total_margin = seq(-14, 14, by = 0.5)
) %>%
# Metric to be minimized will be euclidean distance
# from each candidate pair.
mutate(
sum_euclidean_dist = NA_real_
)
pit_iteration_df <-
pit_df %>%
mutate(
euclidean_dist = NA_real_
)
for(i in 1:nrow(pit_candidate_grid)){
pit_iteration_df <- pit_iteration_df %>%
mutate(
euclidean_dist =
sqrt(
(pit_ats_margin - pit_candidate_grid$pit_ats_margin[i])^2 +
(total_margin - pit_candidate_grid$total_margin[i])^2
)
)
pit_candidate_grid$sum_euclidean_dist[i] =
sum(pit_iteration_df$euclidean_dist)
}
pit_estimate_df <-
pit_candidate_grid %>%
# Take the 10 best candidates...
slice_min(
order_by = sum_euclidean_dist,
n = 10
) %>%
# and average over each component
# to develop point estimate.
summarize(
pit_ats_margin = mean(pit_ats_margin),
total_margin = mean(total_margin)
)
pit_candidate_grid %>%
ggplot(
aes(
x = pit_ats_margin,
y = total_margin
)
) +
geom_contour_filled(
aes(
z = sum_euclidean_dist
),
binwidth = 10
) +
geom_vline(xintercept = 0, color = "white") +
geom_hline(yintercept = 0, color = "white") +
geom_nfl_logos(
data = pit_df,
aes(
team_abbr = opponent
),
width = 0.05
) +
geom_point(
data = pit_candidate_grid %>%
slice_min(
order_by = sum_euclidean_dist,
n = 10
),
color = "red"
) +
geom_point(
data = pit_estimate_df,
color = "white",
shape = 2
) +
scale_x_continuous(
breaks = seq(-60, 60, by = 6),
minor_breaks = NULL
) +
scale_y_continuous(
breaks = seq(-60, 60, by = 6),
minor_breaks = NULL
) +
coord_fixed(
xlim = c(-14, 14),
ylim = c(-14, 14)
) +
labs(
x = "Steelers ATS Margin (pts)",
y = "Game Total Margin (pts)",
title = "2024 Steelers (Thru Week 17)",
subtitle = "Logo = Opponent. Red = 10 Best Estimates.\nWhite = Estimate (Average Over Red).",
caption = "Data: nflfastR:load_pbp(2024)"
) +
theme_light() +
theme(legend.position = "none")
Here, I write a function to do this for all teams in the league for a given season.
# League wide for season
calculate_vegas_margins <-
function(team_of_interest, season_of_interest = 2024L){
# Load regular season data for the season of interest.
df <- nflfastR::load_pbp(season_of_interest) %>%
filter(
season_type == "REG" &
week <= 17L
) %>%
select(
game_id, season, week,
home_team, away_team,
home_score, away_score,
result, total,
spread_line, total_line
)
# Make a vector of all teams that appear in the season
# of interest.
teams <-
df %>%
pull(home_team) %>%
unique() %>%
sort()
# Make a data frame of all final game results.
df_results <-
df %>%
group_by(game_id) %>%
slice_head(n = 1) %>%
ungroup() %>%
mutate(
total_margin = total - total_line,
home_ats_margin = result - spread_line,
away_ats_margin = -1L * home_ats_margin
)
home_df <-
df_results %>%
rename(
team = home_team,
opponent = away_team,
ats_margin = home_ats_margin,
) %>%
mutate(
home_away = "home"
) %>%
select(
season, week,
team, opponent, home_away,
total_margin, ats_margin
)
away_df <-
df_results %>%
rename(
team = away_team,
opponent = home_team,
ats_margin = away_ats_margin,
) %>%
mutate(
home_away = "away"
) %>%
select(
season, week,
team, opponent, home_away,
total_margin, ats_margin
)
long_df <-
bind_rows(
home_df, away_df
) %>%
arrange(
season, team, week
)
# Search grid for point that minimizes the
# Euclidean distance from each games ATS margin
# and over/under total margin.
candidate_grid <-
crossing(
ats_margin = seq(-14, 14, by = 0.5),
total_margin = seq(-14, 14, by = 0.5)
) %>%
mutate(
sum_euclidean_dist = NA_real_
)
# Stores total euclidean distance for each
# candidate pair of ats_margin and total_margin.
iteration_df <-
long_df %>%
filter(
team == team_of_interest &
season == season_of_interest
) %>%
mutate(
euclidean_dist = NA_real_
)
for(i in 1:nrow(candidate_grid)){
iteration_df <- iteration_df %>%
mutate(
euclidean_dist =
sqrt(
(ats_margin - candidate_grid$ats_margin[i])^2 +
(total_margin - candidate_grid$total_margin[i])^2
)
)
candidate_grid$sum_euclidean_dist[i] =
sum(iteration_df$euclidean_dist)
}
return_df <-
candidate_grid %>%
slice_min(
order_by = sum_euclidean_dist,
n = 10
) %>%
summarize(
ats_margin = mean(ats_margin),
total_margin = mean(total_margin)
)
return(return_df)
}
all_teams <-
load_pbp(2023) %>%
pull(home_team) %>%
unique() %>%
sort()
all_team_results_2024 <-
map_df(
.x = all_teams,
.f = calculate_vegas_margins,
season_of_interest = 2024L
) %>%
bind_cols(
df =
data.frame(
team = all_teams,
season = 2024L
)
)
all_team_results_2024 %>%
ggplot(
aes(
x = ats_margin,
y = total_margin
)
) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
geom_nfl_logos(
aes(
team_abbr = team
),
width = 0.05
) +
scale_x_continuous(
breaks = seq(-18, 18, by = 3),
minor_breaks = NULL
) +
scale_y_continuous(
breaks = seq(-18, 18, by = 3),
minor_breaks = NULL
) +
coord_fixed() +
labs(
x = "Central ATS Margin (pts)",
y = "Central Total Margin (pts)",
title = "2024 (Thru Week 17)",
subtitle = "Logo = Team Point Estimate.",
caption = "Data: nflfastR:load_pbp(2024)"
) +
theme_light()
all_team_results_2023 <-
map_df(
.x = all_teams,
.f = calculate_vegas_margins,
season_of_interest = 2023L
) %>%
bind_cols(
df =
data.frame(
team = all_teams,
season = 2023L
)
)
all_team_results_2023 %>%
ggplot(
aes(
x = ats_margin,
y = total_margin
)
) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
geom_nfl_logos(
aes(
team_abbr = team
),
width = 0.05
) +
scale_x_continuous(
breaks = seq(-18, 18, by = 3),
minor_breaks = NULL
) +
scale_y_continuous(
breaks = seq(-18, 18, by = 3),
minor_breaks = NULL
) +
coord_fixed() +
labs(
x = "Central ATS Margin (pts)",
y = "Central Total Margin (pts)",
title = "2023 (Thru Week 17)",
subtitle = "Logo = Team Point Estimate.",
caption = "Data: nflfastR:load_pbp(2023)"
) +
theme_light()