Load Packages¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway
from scipy.stats import shapiro
from scipy.stats import levene
from scipy.stats import pearsonr
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
Load Dataset¶
In [2]:
data = pd.read_excel('Strikers_performance.xlsx')
data.head()
Out[2]:
Striker_ID | Nationality | Footedness | Marital Status | Goals Scored | Assists | Shots on Target | Shot Accuracy | Conversion Rate | Dribbling Success | Movement off the Ball | Hold-up Play | Aerial Duels Won | Defensive Contribution | Big Game Performance | Consistency | Penalty Success Rate | Impact on Team Performance | Off-field Conduct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Spain | Left-footed | No | 17.483571 | 10.778533 | 34.795488 | 0.677836 | 0.166241 | 0.757061 | 50.921924 | 71.806409 | 15.682532 | 30.412215 | 6.152481 | 0.820314 | 0.922727 | 8.570370 | 11.451388 |
1 | 2 | France | Left-footed | Yes | 14.308678 | 13.728250 | 31.472436 | 0.544881 | 0.192774 | 0.796818 | 61.396150 | 53.726866 | 19.843983 | 26.474913 | 6.093172 | 0.803321 | 0.678984 | 3.444638 | 8.243689 |
2 | 3 | Germany | Left-footed | No | 18.238443 | 3.804297 | 25.417413 | 0.518180 | 0.160379 | 0.666869 | 65.863945 | 60.452227 | 20.090084 | 24.164116 | 3.408714 | 0.766540 | 0.843858 | 8.429491 | 9.506835 |
3 | 4 | France | Right-footed | No | 22.615149 | 9.688908 | 20.471443 | 0.599663 | 0.184602 | 0.638776 | 88.876877 | 60.511979 | 22.363152 | 44.129989 | 6.339820 | 0.611798 | 0.662997 | 6.532552 | 8.199653 |
4 | 5 | France | Left-footed | Yes | 13.829233 | 6.048072 | 29.887563 | 0.582982 | 0.105319 | 0.591485 | 75.565531 | 54.982158 | 13.165708 | 37.859323 | 8.465658 | 0.701638 | 0.906538 | 8.414915 | 6.665333 |
Data Cleaning¶
Missing values¶
In [3]:
data.isnull().sum()
Out[3]:
Striker_ID                    0
Nationality                   0
Footedness                    0
Marital Status                0
Goals Scored                  0
Assists                       0
Shots on Target               0
Shot Accuracy                 0
Conversion Rate               0
Dribbling Success             0
Movement off the Ball         6
Hold-up Play                  0
Aerial Duels Won              0
Defensive Contribution        0
Big Game Performance          2
Consistency                   0
Penalty Success Rate          5
Impact on Team Performance    0
Off-field Conduct             0
dtype: int64
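Before imputing, it can also help to look at the share of missing values per column rather than the raw counts; a minimal sketch (the missing_pct name is illustrative, not from the original notebook):

# Share of missing values per column, as a percentage of all rows
missing_pct = data.isnull().mean() * 100
print(missing_pct[missing_pct > 0].round(2))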
In [4]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
data[['Movement off the Ball',
'Big Game Performance',
'Penalty Success Rate']] = imputer.fit_transform(data[['Movement off the Ball',
'Big Game Performance',
'Penalty Success Rate']])
data.isnull().sum()
Out[4]:
Striker_ID                    0
Nationality                   0
Footedness                    0
Marital Status                0
Goals Scored                  0
Assists                       0
Shots on Target               0
Shot Accuracy                 0
Conversion Rate               0
Dribbling Success             0
Movement off the Ball         0
Hold-up Play                  0
Aerial Duels Won              0
Defensive Contribution        0
Big Game Performance          0
Consistency                   0
Penalty Success Rate          0
Impact on Team Performance    0
Off-field Conduct             0
dtype: int64
Data Types¶
In [5]:
data.dtypes
Out[5]:
Striker_ID                      int64
Nationality                    object
Footedness                     object
Marital Status                 object
Goals Scored                  float64
Assists                       float64
Shots on Target               float64
Shot Accuracy                 float64
Conversion Rate               float64
Dribbling Success             float64
Movement off the Ball         float64
Hold-up Play                  float64
Aerial Duels Won              float64
Defensive Contribution        float64
Big Game Performance          float64
Consistency                   float64
Penalty Success Rate          float64
Impact on Team Performance    float64
Off-field Conduct             float64
dtype: object
In [6]:
variables = ['Goals Scored', 'Assists',
'Shots on Target',
'Movement off the Ball',
'Hold-up Play',
'Aerial Duels Won',
'Defensive Contribution',
'Big Game Performance',
'Impact on Team Performance',
'Off-field Conduct']
for var in variables:
data[var] = data[var].astype('int')
data.dtypes
Out[6]:
Striker_ID                      int64
Nationality                    object
Footedness                     object
Marital Status                 object
Goals Scored                    int64
Assists                         int64
Shots on Target                 int64
Shot Accuracy                 float64
Conversion Rate               float64
Dribbling Success             float64
Movement off the Ball           int64
Hold-up Play                    int64
Aerial Duels Won                int64
Defensive Contribution          int64
Big Game Performance            int64
Consistency                   float64
Penalty Success Rate          float64
Impact on Team Performance      int64
Off-field Conduct               int64
dtype: object
In [7]:
data.head()
Out[7]:
Striker_ID | Nationality | Footedness | Marital Status | Goals Scored | Assists | Shots on Target | Shot Accuracy | Conversion Rate | Dribbling Success | Movement off the Ball | Hold-up Play | Aerial Duels Won | Defensive Contribution | Big Game Performance | Consistency | Penalty Success Rate | Impact on Team Performance | Off-field Conduct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Spain | Left-footed | No | 17 | 10 | 34 | 0.677836 | 0.166241 | 0.757061 | 50 | 71 | 15 | 30 | 6 | 0.820314 | 0.922727 | 8 | 11 |
1 | 2 | France | Left-footed | Yes | 14 | 13 | 31 | 0.544881 | 0.192774 | 0.796818 | 61 | 53 | 19 | 26 | 6 | 0.803321 | 0.678984 | 3 | 8 |
2 | 3 | Germany | Left-footed | No | 18 | 3 | 25 | 0.518180 | 0.160379 | 0.666869 | 65 | 60 | 20 | 24 | 3 | 0.766540 | 0.843858 | 8 | 9 |
3 | 4 | France | Right-footed | No | 22 | 9 | 20 | 0.599663 | 0.184602 | 0.638776 | 88 | 60 | 22 | 44 | 6 | 0.611798 | 0.662997 | 6 | 8 |
4 | 5 | France | Left-footed | Yes | 13 | 6 | 29 | 0.582982 | 0.105319 | 0.591485 | 75 | 54 | 13 | 37 | 8 | 0.701638 | 0.906538 | 8 | 6 |
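Note that astype('int') truncates the decimal part rather than rounding (17.48 becomes 17, but 17.99 would also become 17). If rounding to the nearest integer were preferred, a minimal alternative sketch would be:

# Round to the nearest integer before casting, instead of truncating
for var in variables:
    data[var] = data[var].round().astype(int)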
Exploratory Data Analysis¶
Perform descriptive analysis¶
In [8]:
round(data.describe(), 2)
Out[8]:
Striker_ID | Goals Scored | Assists | Shots on Target | Shot Accuracy | Conversion Rate | Dribbling Success | Movement off the Ball | Hold-up Play | Aerial Duels Won | Defensive Contribution | Big Game Performance | Consistency | Penalty Success Rate | Impact on Team Performance | Off-field Conduct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 500.00 | 500.00 | 500.00 | 500.00 | 500.00 | 500.00 | 500.00 | 500.00 | 500.00 | 500.00 | 500.00 | 500.00 | 500.00 | 500.00 | 500.00 | 500.00 |
mean | 250.50 | 14.52 | 7.60 | 25.26 | 0.60 | 0.20 | 0.70 | 69.28 | 59.33 | 19.04 | 39.47 | 6.43 | 0.74 | 0.80 | 6.52 | 7.57 |
std | 144.48 | 4.91 | 2.96 | 7.08 | 0.10 | 0.05 | 0.10 | 10.33 | 10.15 | 4.95 | 9.90 | 1.98 | 0.10 | 0.10 | 2.11 | 2.09 |
min | 1.00 | 0.00 | 0.00 | 4.00 | 0.31 | 0.05 | 0.40 | 40.00 | 35.00 | 4.00 | 8.00 | 2.00 | 0.46 | 0.53 | 0.00 | 0.00 |
25% | 125.75 | 11.00 | 6.00 | 20.00 | 0.54 | 0.17 | 0.64 | 62.00 | 52.00 | 16.00 | 33.00 | 5.00 | 0.68 | 0.72 | 5.00 | 6.00 |
50% | 250.50 | 15.00 | 8.00 | 25.00 | 0.60 | 0.20 | 0.70 | 69.00 | 60.00 | 19.00 | 39.50 | 6.00 | 0.75 | 0.80 | 6.50 | 8.00 |
75% | 375.25 | 18.00 | 9.00 | 30.00 | 0.67 | 0.23 | 0.76 | 76.00 | 66.00 | 22.00 | 46.00 | 8.00 | 0.81 | 0.87 | 8.00 | 9.00 |
max | 500.00 | 34.00 | 15.00 | 43.00 | 0.92 | 0.36 | 1.00 | 98.00 | 92.00 | 34.00 | 71.00 | 12.00 | 1.00 | 1.00 | 13.00 | 13.00 |
Perform percentage analysis¶
In [9]:
freq_Footedness = data['Footedness'].value_counts()
perc_Footedness = freq_Footedness/len(data['Footedness'])*100
perc_Footedness
Out[9]:
Right-footed    53.4
Left-footed     46.6
Name: Footedness, dtype: float64
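An equivalent one-liner, if preferred, uses the normalize argument of value_counts; a minimal sketch:

# Same percentages computed directly by value_counts
perc_Footedness = data['Footedness'].value_counts(normalize=True) * 100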
In [10]:
plt.figure(figsize=(12, 6))
perc_Footedness.plot(kind='pie', autopct='%1.2f%%')
plt.title('Percentage of strikers by their footedness')
plt.ylabel('')
plt.show()
Which nationality's strikers have the highest average number of goals scored?¶
In [11]:
goals_by_nationality = data.groupby('Nationality')['Goals Scored'].mean().sort_values(ascending=False)
round(goals_by_nationality)
Out[11]:
Nationality
Brazil     15.0
Spain      15.0
France     14.0
Germany    14.0
England    14.0
Name: Goals Scored, dtype: float64
What is the average conversion rate for players based on their footedness?¶
In [12]:
conversion_rate_by_footedness = data.groupby('Footedness')['Conversion Rate'].mean()
conversion_rate_by_footedness
Out[12]:
Footedness
Left-footed     0.198086
Right-footed    0.200592
Name: Conversion Rate, dtype: float64
What is the distribution of players' footedness across different nationalities?¶
In [13]:
footedness_by_nationality = pd.crosstab(data['Nationality'], data['Footedness'])
footedness_by_nationality
Out[13]:
Footedness | Left-footed | Right-footed |
---|---|---|
Nationality | ||
Brazil | 42 | 53 |
England | 50 | 59 |
France | 42 | 51 |
Germany | 44 | 47 |
Spain | 55 | 57 |
In [14]:
plt.figure(figsize=(12, 6))
sns.countplot(x='Nationality', hue='Footedness', data=data)
plt.title('Distribution of players\' footedness across different nationalities')
plt.xlabel('Nationality')
plt.ylabel('Count')
plt.show()
Create a correlation matrix with a heatmap¶
In [15]:
num_variables = data.select_dtypes(include = ['number']).columns
correl_matrix = round(data[num_variables].corr(), 3)
correl_matrix
Out[15]:
Striker_ID | Goals Scored | Assists | Shots on Target | Shot Accuracy | Conversion Rate | Dribbling Success | Movement off the Ball | Hold-up Play | Aerial Duels Won | Defensive Contribution | Big Game Performance | Consistency | Penalty Success Rate | Impact on Team Performance | Off-field Conduct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Striker_ID | 1.000 | 0.025 | 0.066 | 0.003 | 0.018 | 0.047 | -0.031 | 0.060 | -0.036 | 0.016 | 0.012 | -0.052 | -0.017 | -0.006 | -0.027 | 0.103 |
Goals Scored | 0.025 | 1.000 | -0.071 | -0.059 | 0.066 | -0.011 | 0.034 | 0.016 | 0.003 | -0.047 | 0.014 | -0.004 | 0.045 | -0.021 | 0.103 | -0.053 |
Assists | 0.066 | -0.071 | 1.000 | 0.072 | -0.020 | 0.028 | 0.050 | 0.011 | -0.046 | 0.001 | -0.011 | 0.026 | -0.007 | -0.081 | -0.022 | -0.015 |
Shots on Target | 0.003 | -0.059 | 0.072 | 1.000 | -0.021 | -0.044 | 0.010 | -0.062 | -0.102 | -0.068 | 0.011 | 0.039 | 0.049 | 0.029 | 0.025 | 0.055 |
Shot Accuracy | 0.018 | 0.066 | -0.020 | -0.021 | 1.000 | -0.088 | 0.023 | -0.018 | -0.044 | 0.012 | 0.027 | -0.019 | 0.005 | 0.017 | 0.041 | 0.084 |
Conversion Rate | 0.047 | -0.011 | 0.028 | -0.044 | -0.088 | 1.000 | -0.020 | 0.038 | 0.059 | 0.059 | -0.002 | 0.037 | 0.031 | -0.049 | -0.031 | -0.004 |
Dribbling Success | -0.031 | 0.034 | 0.050 | 0.010 | 0.023 | -0.020 | 1.000 | -0.056 | 0.002 | -0.072 | 0.015 | -0.059 | -0.030 | -0.054 | -0.012 | -0.060 |
Movement off the Ball | 0.060 | 0.016 | 0.011 | -0.062 | -0.018 | 0.038 | -0.056 | 1.000 | -0.057 | 0.013 | -0.006 | -0.075 | -0.030 | -0.071 | -0.046 | -0.035 |
Hold-up Play | -0.036 | 0.003 | -0.046 | -0.102 | -0.044 | 0.059 | 0.002 | -0.057 | 1.000 | 0.080 | 0.029 | -0.031 | 0.147 | 0.068 | -0.046 | 0.035 |
Aerial Duels Won | 0.016 | -0.047 | 0.001 | -0.068 | 0.012 | 0.059 | -0.072 | 0.013 | 0.080 | 1.000 | -0.025 | -0.013 | 0.047 | -0.002 | 0.043 | 0.004 |
Defensive Contribution | 0.012 | 0.014 | -0.011 | 0.011 | 0.027 | -0.002 | 0.015 | -0.006 | 0.029 | -0.025 | 1.000 | -0.055 | -0.062 | 0.044 | -0.019 | 0.007 |
Big Game Performance | -0.052 | -0.004 | 0.026 | 0.039 | -0.019 | 0.037 | -0.059 | -0.075 | -0.031 | -0.013 | -0.055 | 1.000 | -0.001 | 0.005 | -0.002 | 0.053 |
Consistency | -0.017 | 0.045 | -0.007 | 0.049 | 0.005 | 0.031 | -0.030 | -0.030 | 0.147 | 0.047 | -0.062 | -0.001 | 1.000 | 0.024 | -0.035 | 0.099 |
Penalty Success Rate | -0.006 | -0.021 | -0.081 | 0.029 | 0.017 | -0.049 | -0.054 | -0.071 | 0.068 | -0.002 | 0.044 | 0.005 | 0.024 | 1.000 | 0.048 | -0.010 |
Impact on Team Performance | -0.027 | 0.103 | -0.022 | 0.025 | 0.041 | -0.031 | -0.012 | -0.046 | -0.046 | 0.043 | -0.019 | -0.002 | -0.035 | 0.048 | 1.000 | 0.006 |
Off-field Conduct | 0.103 | -0.053 | -0.015 | 0.055 | 0.084 | -0.004 | -0.060 | -0.035 | 0.035 | 0.004 | 0.007 | 0.053 | 0.099 | -0.010 | 0.006 | 1.000 |
In [16]:
plt.figure(figsize=(18, 10))
sns.heatmap(correl_matrix, annot=True)
plt.title('Heatmap of Correlation Matrix')
plt.show()
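Since the correlation matrix is symmetric, the upper triangle could optionally be masked to reduce clutter; a minimal sketch, assuming the same correl_matrix:

# Mask the redundant upper triangle of the symmetric matrix
mask = np.triu(np.ones_like(correl_matrix, dtype=bool))
plt.figure(figsize=(18, 10))
sns.heatmap(correl_matrix, mask=mask, annot=True)
plt.title('Heatmap of Correlation Matrix (lower triangle)')
plt.show()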
Statistical Test¶
Find whether there is any significant difference in consistency rates among strikers from various nationalities¶
In [17]:
# Normality test
stat, p_value = shapiro(data['Consistency'])
print('P value: ', round(p_value, 3))
P value: 0.451
In [18]:
# Filtering data
Spain = data.query('Nationality == "Spain"')['Consistency']
France = data.query('Nationality == "France"')['Consistency']
Germany = data.query('Nationality == "Germany"')['Consistency']
Brazil = data.query('Nationality == "Brazil"')['Consistency']
England = data.query('Nationality == "England"')['Consistency']
In [19]:
# Levene's test for homogeneity of variances
stats, p_value = levene(Spain, France, Germany, Brazil, England)
print("P value: ", round(p_value, 3))
P value: 0.808
In [20]:
# One way ANOVA
Test_stat, p_value = f_oneway(Spain, France, Germany, Brazil, England)
print("P value: ", round(p_value, 2))
P value: 0.19
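Since the Shapiro-Wilk and Levene p-values (0.451 and 0.808) are above 0.05, the normality and equal-variance assumptions are reasonable. The ANOVA p-value of 0.19 is also above 0.05, so there is no statistically significant difference in consistency rates across nationalities.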
Check if there is any significant correlation between strikers' Hold-up play and consistency rate¶
In [21]:
# Normality test
stat, p_value = shapiro(data['Hold-up Play'])
print('P value: ', round(p_value, 3))
P value: 0.151
In [22]:
# Linearity test
plt.figure(figsize = (10, 6))
sns.regplot(x = 'Hold-up Play', y = 'Consistency', data = data)
plt.title('Linearity between Hold-up Play and Consistency')
plt.xlabel('Hold-up Play')
plt.ylabel('Consistency')
plt.show()
In [23]:
# Pearson correlation
HU_play = data['Hold-up Play']
Consistency = data['Consistency']
corr, p_value = pearsonr(HU_play, Consistency)
print("Correlation coefficient: ", round(corr, 3))
print("P value: ", round(p_value, 3))
Correlation coefficient:  0.147
P value:  0.001
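The correlation is weak but statistically significant (r ≈ 0.147, p ≈ 0.001). As an optional robustness check, a rank-based correlation could be computed as well; a minimal sketch:

# Spearman rank correlation as a complement to Pearson
from scipy.stats import spearmanr
rho, p_value = spearmanr(HU_play, Consistency)
print("Spearman correlation: ", round(rho, 3))
print("P value: ", round(p_value, 3))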
Check if strikers' hold-up play significantly influences their consistency rate¶
In [24]:
x = data['Hold-up Play']
y = data['Consistency']
x_constant = sm.add_constant(x)
model = sm.OLS(y, x_constant).fit()
print(model.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:            Consistency   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.020
Method:                 Least Squares   F-statistic:                     10.93
Date:                Thu, 18 Apr 2024   Prob (F-statistic):            0.00101
Time:                        17:19:02   Log-Likelihood:                 429.97
No. Observations:                 500   AIC:                            -855.9
Df Residuals:                     498   BIC:                            -847.5
Df Model:                           1
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.6548      0.027     24.031      0.000       0.601       0.708
Hold-up Play     0.0015      0.000      3.306      0.001       0.001       0.002
==============================================================================
Omnibus:                        1.708   Durbin-Watson:                   2.135
Prob(Omnibus):                  0.426   Jarque-Bera (JB):                1.744
Skew:                          -0.100   Prob(JB):                        0.418
Kurtosis:                       2.791   Cond. No.                         358.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
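The slope on Hold-up Play (0.0015) is statistically significant (p = 0.001), but the R-squared of 0.021 means hold-up play explains only about 2% of the variation in consistency. For illustration, using an assumed hold-up play score of 60, the fitted line predicts a consistency of roughly 0.6548 + 0.0015 × 60 ≈ 0.745.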
Feature Engineering¶
Create a new feature - Total contribution score¶
In [25]:
data['Total contribution score'] = (data['Goals Scored'] + data['Assists'] + data['Shots on Target'] + data['Dribbling Success'] + data['Aerial Duels Won'] + data['Defensive Contribution'] + data['Big Game Performance'] + data['Consistency'])
data.head()
Out[25]:
Striker_ID | Nationality | Footedness | Marital Status | Goals Scored | Assists | Shots on Target | Shot Accuracy | Conversion Rate | Dribbling Success | Movement off the Ball | Hold-up Play | Aerial Duels Won | Defensive Contribution | Big Game Performance | Consistency | Penalty Success Rate | Impact on Team Performance | Off-field Conduct | Total contribution score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Spain | Left-footed | No | 17 | 10 | 34 | 0.677836 | 0.166241 | 0.757061 | 50 | 71 | 15 | 30 | 6 | 0.820314 | 0.922727 | 8 | 11 | 113.577376 |
1 | 2 | France | Left-footed | Yes | 14 | 13 | 31 | 0.544881 | 0.192774 | 0.796818 | 61 | 53 | 19 | 26 | 6 | 0.803321 | 0.678984 | 3 | 8 | 110.600139 |
2 | 3 | Germany | Left-footed | No | 18 | 3 | 25 | 0.518180 | 0.160379 | 0.666869 | 65 | 60 | 20 | 24 | 3 | 0.766540 | 0.843858 | 8 | 9 | 94.433410 |
3 | 4 | France | Right-footed | No | 22 | 9 | 20 | 0.599663 | 0.184602 | 0.638776 | 88 | 60 | 22 | 44 | 6 | 0.611798 | 0.662997 | 6 | 8 | 124.250575 |
4 | 5 | France | Left-footed | Yes | 13 | 6 | 29 | 0.582982 | 0.105319 | 0.591485 | 75 | 54 | 13 | 37 | 8 | 0.701638 | 0.906538 | 8 | 6 | 107.293123 |
Encode Footedness and Marital Status with LabelEncoder¶
In [26]:
encoder = LabelEncoder()
data['Footedness'] = encoder.fit_transform(data['Footedness'])
data['Marital Status'] = encoder.fit_transform(data['Marital Status'])
data.head()
Out[26]:
Striker_ID | Nationality | Footedness | Marital Status | Goals Scored | Assists | Shots on Target | Shot Accuracy | Conversion Rate | Dribbling Success | Movement off the Ball | Hold-up Play | Aerial Duels Won | Defensive Contribution | Big Game Performance | Consistency | Penalty Success Rate | Impact on Team Performance | Off-field Conduct | Total contribution score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Spain | 0 | 0 | 17 | 10 | 34 | 0.677836 | 0.166241 | 0.757061 | 50 | 71 | 15 | 30 | 6 | 0.820314 | 0.922727 | 8 | 11 | 113.577376 |
1 | 2 | France | 0 | 1 | 14 | 13 | 31 | 0.544881 | 0.192774 | 0.796818 | 61 | 53 | 19 | 26 | 6 | 0.803321 | 0.678984 | 3 | 8 | 110.600139 |
2 | 3 | Germany | 0 | 0 | 18 | 3 | 25 | 0.518180 | 0.160379 | 0.666869 | 65 | 60 | 20 | 24 | 3 | 0.766540 | 0.843858 | 8 | 9 | 94.433410 |
3 | 4 | France | 1 | 0 | 22 | 9 | 20 | 0.599663 | 0.184602 | 0.638776 | 88 | 60 | 22 | 44 | 6 | 0.611798 | 0.662997 | 6 | 8 | 124.250575 |
4 | 5 | France | 0 | 1 | 13 | 6 | 29 | 0.582982 | 0.105319 | 0.591485 | 75 | 54 | 13 | 37 | 8 | 0.701638 | 0.906538 | 8 | 6 | 107.293123 |
Create dummies for Nationality and add them to the data¶
In [27]:
dummies = pd.get_dummies(data['Nationality'])
processed_df = pd.concat([data, dummies], axis = 1)
processed_df = processed_df.drop('Nationality', axis = 1)
processed_df.head()
Out[27]:
Striker_ID | Footedness | Marital Status | Goals Scored | Assists | Shots on Target | Shot Accuracy | Conversion Rate | Dribbling Success | Movement off the Ball | ... | Consistency | Penalty Success Rate | Impact on Team Performance | Off-field Conduct | Total contribution score | Brazil | England | France | Germany | Spain | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 17 | 10 | 34 | 0.677836 | 0.166241 | 0.757061 | 50 | ... | 0.820314 | 0.922727 | 8 | 11 | 113.577376 | 0 | 0 | 0 | 0 | 1 |
1 | 2 | 0 | 1 | 14 | 13 | 31 | 0.544881 | 0.192774 | 0.796818 | 61 | ... | 0.803321 | 0.678984 | 3 | 8 | 110.600139 | 0 | 0 | 1 | 0 | 0 |
2 | 3 | 0 | 0 | 18 | 3 | 25 | 0.518180 | 0.160379 | 0.666869 | 65 | ... | 0.766540 | 0.843858 | 8 | 9 | 94.433410 | 0 | 0 | 0 | 1 | 0 |
3 | 4 | 1 | 0 | 22 | 9 | 20 | 0.599663 | 0.184602 | 0.638776 | 88 | ... | 0.611798 | 0.662997 | 6 | 8 | 124.250575 | 0 | 0 | 1 | 0 | 0 |
4 | 5 | 0 | 1 | 13 | 6 | 29 | 0.582982 | 0.105319 | 0.591485 | 75 | ... | 0.701638 | 0.906538 | 8 | 6 | 107.293123 | 0 | 0 | 1 | 0 | 0 |
5 rows × 24 columns
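To avoid perfect multicollinearity among the five nationality dummies (the dummy variable trap), one category could optionally be dropped; a minimal sketch of that alternative:

# Optional: drop one nationality column so the dummies are not perfectly collinear
dummies = pd.get_dummies(data['Nationality'], drop_first=True)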
Cluster Analysis¶
Perform KMeans clustering¶
In [28]:
# Selecting features
x = processed_df.drop('Striker_ID', axis = 1)
# Calculating WCSS score
wcss = []
for i in range(1, 15):
kmeans = KMeans(n_clusters = i, init = 'k-means++')
kmeans.fit(x)
wcss_score = kmeans.inertia_
wcss.append(wcss_score)
In [29]:
# Plotting elbow chart
plt.figure(figsize = (12, 6))
plt.plot(range(1, 15), wcss, marker = 'o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS')
plt.show()
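As a complementary check on the number of clusters suggested by the elbow, silhouette scores could be computed for a few candidate values of k; a minimal sketch, assuming the same feature matrix x:

# Silhouette score for candidate cluster counts (higher is better)
from sklearn.metrics import silhouette_score
for k in range(2, 6):
    km = KMeans(n_clusters=k, init='k-means++', random_state=42)
    print(k, round(silhouette_score(x, km.fit_predict(x)), 3))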
In [30]:
# Building KMeans with k = 2
final_km = KMeans(n_clusters = 2)
final_km.fit(x)
# Generating labels
labels = final_km.labels_
labels
Out[30]:
array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1], dtype=int32)
In [31]:
# Adding labels
processed_df['Clusters'] = labels
processed_df.head()
Out[31]:
Striker_ID | Footedness | Marital Status | Goals Scored | Assists | Shots on Target | Shot Accuracy | Conversion Rate | Dribbling Success | Movement off the Ball | ... | Penalty Success Rate | Impact on Team Performance | Off-field Conduct | Total contribution score | Brazil | England | France | Germany | Spain | Clusters | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 17 | 10 | 34 | 0.677836 | 0.166241 | 0.757061 | 50 | ... | 0.922727 | 8 | 11 | 113.577376 | 0 | 0 | 0 | 0 | 1 | 1 |
1 | 2 | 0 | 1 | 14 | 13 | 31 | 0.544881 | 0.192774 | 0.796818 | 61 | ... | 0.678984 | 3 | 8 | 110.600139 | 0 | 0 | 1 | 0 | 0 | 1 |
2 | 3 | 0 | 0 | 18 | 3 | 25 | 0.518180 | 0.160379 | 0.666869 | 65 | ... | 0.843858 | 8 | 9 | 94.433410 | 0 | 0 | 0 | 1 | 0 | 1 |
3 | 4 | 1 | 0 | 22 | 9 | 20 | 0.599663 | 0.184602 | 0.638776 | 88 | ... | 0.662997 | 6 | 8 | 124.250575 | 0 | 0 | 1 | 0 | 0 | 0 |
4 | 5 | 0 | 1 | 13 | 6 | 29 | 0.582982 | 0.105319 | 0.591485 | 75 | ... | 0.906538 | 8 | 6 | 107.293123 | 0 | 0 | 1 | 0 | 0 | 1 |
5 rows × 25 columns
In [32]:
# Checking clusters
round(processed_df.groupby('Clusters')['Total contribution score'].mean(), 2)
Out[32]:
Clusters
0    123.39
1    101.90
Name: Total contribution score, dtype: float64
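Cluster 0 has the higher mean total contribution score (123.39 vs. 101.90), so it is labelled as the 'Best strikers' group below.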
In [33]:
# Assigning meaningful names
mapping = {0:'Best strikers', 1:'Regular strikers'}
processed_df['Strikers types'] = processed_df['Clusters'].map(mapping)
In [34]:
# Deleting the Clusters variable
processed_df = processed_df.drop('Clusters', axis = 1)
processed_df.head()
Out[34]:
Striker_ID | Footedness | Marital Status | Goals Scored | Assists | Shots on Target | Shot Accuracy | Conversion Rate | Dribbling Success | Movement off the Ball | ... | Penalty Success Rate | Impact on Team Performance | Off-field Conduct | Total contribution score | Brazil | England | France | Germany | Spain | Strikers types | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 17 | 10 | 34 | 0.677836 | 0.166241 | 0.757061 | 50 | ... | 0.922727 | 8 | 11 | 113.577376 | 0 | 0 | 0 | 0 | 1 | Regular strikers |
1 | 2 | 0 | 1 | 14 | 13 | 31 | 0.544881 | 0.192774 | 0.796818 | 61 | ... | 0.678984 | 3 | 8 | 110.600139 | 0 | 0 | 1 | 0 | 0 | Regular strikers |
2 | 3 | 0 | 0 | 18 | 3 | 25 | 0.518180 | 0.160379 | 0.666869 | 65 | ... | 0.843858 | 8 | 9 | 94.433410 | 0 | 0 | 0 | 1 | 0 | Regular strikers |
3 | 4 | 1 | 0 | 22 | 9 | 20 | 0.599663 | 0.184602 | 0.638776 | 88 | ... | 0.662997 | 6 | 8 | 124.250575 | 0 | 0 | 1 | 0 | 0 | Best strikers |
4 | 5 | 0 | 1 | 13 | 6 | 29 | 0.582982 | 0.105319 | 0.591485 | 75 | ... | 0.906538 | 8 | 6 | 107.293123 | 0 | 0 | 1 | 0 | 0 | Regular strikers |
5 rows × 25 columns
Data Preprocessing for ML¶
New feature mapping¶
In [35]:
mapping = {'Best strikers':1, 'Regular strikers': 0}
processed_df['Strikers types'] = processed_df['Strikers types'].map(mapping)
processed_df.head()
Out[35]:
Striker_ID | Footedness | Marital Status | Goals Scored | Assists | Shots on Target | Shot Accuracy | Conversion Rate | Dribbling Success | Movement off the Ball | ... | Penalty Success Rate | Impact on Team Performance | Off-field Conduct | Total contribution score | Brazil | England | France | Germany | Spain | Strikers types | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 17 | 10 | 34 | 0.677836 | 0.166241 | 0.757061 | 50 | ... | 0.922727 | 8 | 11 | 113.577376 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 2 | 0 | 1 | 14 | 13 | 31 | 0.544881 | 0.192774 | 0.796818 | 61 | ... | 0.678984 | 3 | 8 | 110.600139 | 0 | 0 | 1 | 0 | 0 | 0 |
2 | 3 | 0 | 0 | 18 | 3 | 25 | 0.518180 | 0.160379 | 0.666869 | 65 | ... | 0.843858 | 8 | 9 | 94.433410 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 4 | 1 | 0 | 22 | 9 | 20 | 0.599663 | 0.184602 | 0.638776 | 88 | ... | 0.662997 | 6 | 8 | 124.250575 | 0 | 0 | 1 | 0 | 0 | 1 |
4 | 5 | 0 | 1 | 13 | 6 | 29 | 0.582982 | 0.105319 | 0.591485 | 75 | ... | 0.906538 | 8 | 6 | 107.293123 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 25 columns
Selecting features¶
In [36]:
x = processed_df.drop(['Striker_ID', 'Strikers types'], axis = 1)
y = processed_df['Strikers types']
Scaling features¶
In [37]:
scaler = StandardScaler()
scaled_x = scaler.fit_transform(x)
scaled_x
Out[37]:
array([[-1.07047781, -1.03252879, 0.5050467 , ..., -0.47801802, -0.47169258, 1.86125917], [-1.07047781, 0.968496 , -0.10638998, ..., 2.09197134, -0.47169258, -0.53727069], [-1.07047781, -1.03252879, 0.70885893, ..., -0.47801802, 2.12002488, -0.53727069], ..., [-1.07047781, 0.968496 , -0.10638998, ..., -0.47801802, -0.47169258, -0.53727069], [ 0.93416229, 0.968496 , -0.9216389 , ..., -0.47801802, -0.47169258, -0.53727069], [-1.07047781, -1.03252879, -1.32926335, ..., -0.47801802, -0.47169258, -0.53727069]])
Train test split¶
In [38]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(scaled_x, y, test_size = 0.2, random_state = 42)
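Note that the scaler above was fitted on the full dataset before splitting. To avoid any information from the test set leaking into the scaling, one could instead split first and fit the scaler on the training data only; a minimal sketch of that alternative:

# Split on the unscaled features, then fit the scaler on the training split only
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)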
Predictive Classification Analytics¶
Build a logistic regression machine learning model to predict striker type¶
In [39]:
# Model training
lgr_model = LogisticRegression()
lgr_model.fit(x_train, y_train)
# Prediction
y_lgr_pred = lgr_model.predict(x_test)
# Evaluation
accuracy_lgr = accuracy_score(y_test, y_lgr_pred)
print(accuracy_lgr*100,'%')
97.0 %
In [40]:
# Creating confusion matrix
conf_matrix_lgr = confusion_matrix(y_test, y_lgr_pred)
# Plotting confusion matrix
plt.figure(figsize = (12, 6))
sns.heatmap(conf_matrix_lgr, annot = True, fmt = "d", cmap = "Blues")
plt.title('Confusion Matrix for LGR model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
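Beyond accuracy and the confusion matrix, a per-class breakdown and a quick cross-validation estimate could round out the evaluation; a minimal sketch:

# Per-class precision/recall and a 5-fold cross-validated accuracy estimate
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

print(classification_report(y_test, y_lgr_pred))
cv_scores = cross_val_score(LogisticRegression(), scaled_x, y, cv=5)
print("5-fold CV accuracy: ", round(cv_scores.mean() * 100, 2), '%')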