# Importing Packages

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

# Custom
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_formats = ['svg'] # makes everything svg by default
%matplotlib inline


# Read Data

dataset = pd.read_excel('ds/EPL2017-18.xlsx')

display( dataset.head() )


# Cleanup
dataset['count'] = 1

dataset['hwinvalue'] = np.where( dataset['FTR']=='H',1, np.where(dataset['FTR']=='D',.5,0) )
dataset['awinvalue'] = np.where( dataset['FTR']=='A',1, np.where(dataset['FTR']=='D',.5,0) )

home1 = dataset[dataset.Date < 20180000].groupby(['HomeTeam'])['count','hwinvalue', 'FTHG','FTAG']\
    .sum().reset_index()
home1 = home1.rename(columns={'HomeTeam':'Team','count':'MPh','FTHG':'GFh', 'FTAG':'GAh'})

away1 = dataset[dataset.Date < 20180000].groupby(['AwayTeam'])['count','awinvalue', 'FTHG','FTAG']\
    .sum().reset_index()
away1 = away1.rename(columns={'AwayTeam':'Team','count':'MPa','FTHG':'GAa','FTAG':'GFa'})
# because my goals in away ground will be home goals against for the other team


home2 = dataset[dataset.Date > 20180000].groupby(['HomeTeam'])['count','hwinvalue', 'FTHG','FTAG']\
    .sum().reset_index()
home2 = home2.rename(columns={'HomeTeam':'Team','count':'MPh','FTHG':'GFh', 'FTAG':'GAh'})

away2 = dataset[dataset.Date > 20180000].groupby(['AwayTeam'])['count','awinvalue', 'FTHG','FTAG']\
    .sum().reset_index()
away2 = away2.rename(columns={'AwayTeam':'Team','count':'MPa','FTHG':'GAa','FTAG':'GFa'})
# because my goals in away ground will be home goals against for the other team

half1 = pd.merge(home1, away1, on="Team")
half2 = pd.merge(home2, away2, on="Team")


# Evaluations
halves = [half1, half2]

for half in halves:
    half["MP"] = half["MPh"] + half["MPa"]
    half["wValue"] = half["hwinvalue"] + half["awinvalue"]
    half["GF"] = half["GFh"] + half["GFa"]
    half["GA"] = half["GAh"] + half["GAa"]


half1["pyth1"] = (half1["GF"]**2) / (half1["GF"]**2 + half1["GA"]**2)
half1["wpc1"] = half1["wValue"]/half1["MP"]


half2["pyth2"] = (half2["GF"]**2) / (half2["GF"]**2 + half2["GA"]**2)
half2["wpc2"] = half2["wValue"]/half2["MP"]


# Cleaned up Dataset
dropCols = ["MPh", "hwinvalue", "GFh", "GAh", "MPa", "awinvalue", "GFa", "GAa"]

for half in halves:
    display( 
        half.drop(columns = dropCols).head()
    )


# using half 1 pyth as predictor for half 2 wpc
predictor = pd.merge(half1, half2, on = "Team")
display(predictor.head())


title = "2nd Half Win% vs 1st Half Win %"

sns.relplot(x="wpc1", y="wpc2", data = predictor)
plt.title(title)
plt.xlim(0, 1), plt.ylim(0, 1)
plt.savefig(title + ".svg", dpi=300, bbox_inches = 'tight')
plt.show()


title = "2nd Half Win% vs 1st Half Pythagorean Expectation"

# Plotting
sns.relplot(x="pyth1", y="wpc2", data = predictor)
plt.title(title)
plt.xlim(0, 1), plt.ylim(0, 1)
plt.savefig(title + ".svg", dpi=300, bbox_inches = 'tight')
plt.show()


regression = smf.ols(formula = 'wpc2 ~ wpc1', data=predictor).fit()
regression.summary()


# Regression

regression = smf.ols(formula = 'wpc2 ~ pyth1', data=predictor).fit()
regression.summary()


# correlation matrix

values = predictor[['Team', 'wpc1', 'wpc2', 'pyth1', 'pyth2']]
display( values.corr() )


# Quiz Questions


print(
    "How many EPL games from this season were played in 2018?"
    + "\n" +
    str(dataset[dataset.Date > 20180000].shape[0])
)

print(
    "Which team scored the highest number of goals while playing at home in the first half of the season?"
    + "\n" +
    half1.sort_values("GFh", ascending=False).iloc[0][0]
)

print(
    "Which team conceded the highest number of goals while playing away in the first half of the season?"
    + "\n" +
    half1.sort_values("GAa", ascending=False).iloc[0][0]
)

half1['dev'] = abs(half1['wpc1'] - half1['pyth1'])
print(
    "Which of the following teams had the smallest difference between their win percentage and Pythagorean expectation in the first half of the season?"
)
display( half1.sort_values("dev", ascending=True).head() )
print("Mancity")
print(
    "Which of the following teams had the smallest difference between their win percentage and Pythagorean expectation in the first half of the season?"
)
display( half1.sort_values("dev", ascending=True).head() )
print("Leicester")


print(
    "Which of the following teams had the highest value for away wins (awinvalue) for in the first half of the season?"
)
display( half1.sort_values("awinvalue", ascending=False).tail() )

half2['gap'] = abs(half2['hwinvalue'] - half2['awinvalue'])
print(
    "Which team had the largest gap between home points won (hwinvalue) and away points won (awinvalue) in the second half the season?"
    + "\n" +
    half2.sort_values("gap", ascending=False).iloc[0][0]
)

print(
    "What was the correlation between win percentage and the Pythagorean expectation in the first half of the season?"
)
display(
    round(values.corr().iloc[0, 2], 3)
)


print(
    "What was the correlation between win percentage in the first half of the season and the second half of the season?"
)
display(
    round(values.corr().iloc[0, 1], 3)
)

print(
    "What was the correlation between win percentage in the second half of the season and the Pythagorean expectation in the first half of the season?"
)
display(
    round(values.corr().iloc[1, 2], 3)
)

How many EPL games from this season were played in 2018?
171
Which team scored the highest number of goals while playing at home in the first half of the season?
Man City
Which team conceded the highest number of goals while playing away in the first half of the season?
Stoke
Which of the following teams had the smallest difference between their win percentage and Pythagorean expectation in the first half of the season?

Mancity
Which of the following teams had the smallest difference between their win percentage and Pythagorean expectation in the first half of the season?

Leicester
Which of the following teams had the highest value for away wins (awinvalue) for in the first half of the season?

Which team had the largest gap between home points won (hwinvalue) and away points won (awinvalue) in the second half the season?
Arsenal
What was the correlation between win percentage and the Pythagorean expectation in the first half of the season?

0.968

What was the correlation between win percentage in the first half of the season and the second half of the season?

0.757

What was the correlation between win percentage in the second half of the season and the Pythagorean expectation in the first half of the season?

0.796

	Date	HomeTeam	AwayTeam	FTHG	FTAG	FTR
0	20170811	Arsenal	Leicester	4	3	H
1	20170812	Brighton	Man City	0	2	A
2	20170812	Chelsea	Burnley	2	3	A
3	20170812	Crystal Palace	Huddersfield	0	3	A
4	20170812	Everton	Stoke	1	0	H

	Team	MP	wValue	GF	GA	pyth1	wpc1
0	Arsenal	21	13.5	38	26	0.681132	0.642857
1	Bournemouth	21	7.5	20	32	0.280899	0.357143
2	Brighton	21	8.5	15	25	0.264706	0.404762
3	Burnley	21	12.5	18	17	0.528548	0.595238
4	Chelsea	21	15.5	39	14	0.885847	0.738095

	Team	MP	wValue	GF	GA	pyth2	wpc2
0	Arsenal	17	8.5	36	25	0.674649	0.500000
1	Bournemouth	17	9.0	25	29	0.426330	0.529412
2	Brighton	17	7.0	19	29	0.300333	0.411765
3	Burnley	17	7.5	18	22	0.400990	0.441176
4	Chelsea	17	9.0	23	24	0.478733	0.529412

	Team	MPh_x	hwinvalue_x	GFh_x	GAh_x	MPa_x	awinvalue_x	GAa_x	GFa_x	MP_x	...	awinvalue_y	GAa_y	GFa_y	MP_y	wValue_y	GF_y	GA_y	pyth2	wpc2	gap
0	Arsenal	10	8.5	25	10	11	5.0	16	13	21	...	1.0	15	7	17	8.5	36	25	0.674649	0.500000	6.5
1	Bournemouth	11	4.5	14	17	10	3.0	15	6	21	...	4.0	16	13	17	9.0	25	29	0.426330	0.529412	1.0
2	Brighton	10	5.5	10	12	11	3.0	13	5	21	...	1.5	16	5	17	7.0	19	29	0.300333	0.411765	4.0
3	Burnley	10	6.0	7	6	11	6.5	11	11	21	...	4.0	11	9	17	7.5	18	22	0.400990	0.441176	0.5
4	Chelsea	11	8.5	21	7	10	7.0	7	18	21	...	4.5	15	14	17	9.0	23	24	0.478733	0.529412	0.0

Dep. Variable:	wpc2	R-squared:	0.572
Model:	OLS	Adj. R-squared:	0.549
Method:	Least Squares	F-statistic:	24.10
Date:	Sat, 09 Apr 2022	Prob (F-statistic):	0.000113
Time:	08:19:08	Log-Likelihood:	18.002
No. Observations:	20	AIC:	-32.00
Df Residuals:	18	BIC:	-30.01
Df Model:	1
Covariance Type:	nonrobust

English Premier League (EPL) Pythagorean Predictor¶

Pythagorean Expectation¶

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	0.1800	0.069	2.607	0.018	0.035	0.325
wpc1	0.6384	0.130	4.909	0.000	0.365	0.912

Omnibus:	2.990	Durbin-Watson:	1.996
Prob(Omnibus):	0.224	Jarque-Bera (JB):	1.319
Skew:	0.196	Prob(JB):	0.517
Kurtosis:	1.805	Cond. No.	7.05

Omnibus:	4.877	Durbin-Watson:	2.048
Prob(Omnibus):	0.087	Jarque-Bera (JB):	1.521
Skew:	-0.033	Prob(JB):	0.467
Kurtosis:	1.650	Cond. No.	4.65

	wpc1	wpc2	pyth1	pyth2
wpc1	1.000000	0.756573	0.968204	0.745832
wpc2	0.756573	1.000000	0.795693	0.955986
pyth1	0.968204	0.795693	1.000000	0.795331
pyth2	0.745832	0.955986	0.795331	1.000000

	Team	MPh	hwinvalue	GFh	GAh	MPa	awinvalue	GAa	GFa	MP	wValue	GF	GA	pyth1	wpc1	dev
8	Leicester	10	5.0	13	14	11	5.0	18	18	21	10.0	31	32	0.484131	0.476190	0.007941
10	Man City	10	9.5	36	7	11	10.5	5	25	21	20.0	61	12	0.962743	0.952381	0.010362
17	Watford	11	4.5	14	23	10	4.5	14	16	21	9.0	30	37	0.396651	0.428571	0.031921
0	Arsenal	10	8.5	25	10	11	5.0	16	13	21	13.5	38	26	0.681132	0.642857	0.038275
12	Newcastle	11	4.0	9	13	10	3.0	17	10	21	7.0	19	30	0.286281	0.333333	0.047053

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	0.2897	0.043	6.690	0.000	0.199	0.381
pyth1	0.4543	0.082	5.573	0.000	0.283	0.626