Linear regression from scratch: Predicting F1 results
Before we start¶
This notebook can be browsed interactively on Kaggle: https://www.kaggle.com/dorin131/f1-predictions-blog
Prologue¶
The other day I learned how to implement linear regression and have been itching to make good use of this newly acquired knowledge. It's funny how ML can be applied to almost anything, but when it came to actually picking one thing, I didn't know what. What would be fun? What is something I'm interested in? So I started going through the Kaggle dataset catalogue... and, BINGO! F1 results from 1950 to 2020! What can I do with this? What else than try to predict who's going to win the next race? I suspect it may not be the best fit for a linear regression model, but it will be fun.
The plan¶
So, the plan is to combine the tables provided in this dataset and end up with a few features that I think may have the biggest effect on the finish position of a driver. Let's say...
- driver standing
- constructor standing
- driver grid position
If only it were that simple to predict the finishing position, right? Well, it isn't, and our predictions won't be all that accurate, but who cares!
Below you'll see me load the data and do some transformations, joins, filtering, conversions, etc. Once that boring part is done, we'll get into implementing the model and then training it!
# Importing some libraries we're going to need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from pandas.plotting import scatter_matrix
Getting the data¶
# Looking at what files we've got and getting their paths
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
/kaggle/input/formula-1-world-championship-1950-2020/races.csv
/kaggle/input/formula-1-world-championship-1950-2020/constructor_results.csv
/kaggle/input/formula-1-world-championship-1950-2020/drivers.csv
/kaggle/input/formula-1-world-championship-1950-2020/constructors.csv
/kaggle/input/formula-1-world-championship-1950-2020/lap_times.csv
/kaggle/input/formula-1-world-championship-1950-2020/status.csv
/kaggle/input/formula-1-world-championship-1950-2020/driver_standings.csv
/kaggle/input/formula-1-world-championship-1950-2020/seasons.csv
/kaggle/input/formula-1-world-championship-1950-2020/pit_stops.csv
/kaggle/input/formula-1-world-championship-1950-2020/sprint_results.csv
/kaggle/input/formula-1-world-championship-1950-2020/constructor_standings.csv
/kaggle/input/formula-1-world-championship-1950-2020/results.csv
/kaggle/input/formula-1-world-championship-1950-2020/circuits.csv
/kaggle/input/formula-1-world-championship-1950-2020/qualifying.csv
results = pd.read_csv('/kaggle/input/formula-1-world-championship-1950-2020/results.csv')
driver_standings = pd.read_csv('/kaggle/input/formula-1-world-championship-1950-2020/driver_standings.csv')
constructor_standings = pd.read_csv('/kaggle/input/formula-1-world-championship-1950-2020/constructor_standings.csv')
Quick look at the data¶
We're just going to print the first 5 rows of each dataset and check the types
results.head()
| | resultId | raceId | driverId | constructorId | number | grid | position | positionText | positionOrder | points | laps | time | milliseconds | fastestLap | rank | fastestLapTime | fastestLapSpeed | statusId |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 18 | 1 | 1 | 22 | 1 | 1 | 1 | 1 | 10.0 | 58 | 1:34:50.616 | 5690616 | 39 | 2 | 1:27.452 | 218.300 | 1 |
1 | 2 | 18 | 2 | 2 | 3 | 5 | 2 | 2 | 2 | 8.0 | 58 | +5.478 | 5696094 | 41 | 3 | 1:27.739 | 217.586 | 1 |
2 | 3 | 18 | 3 | 3 | 7 | 7 | 3 | 3 | 3 | 6.0 | 58 | +8.163 | 5698779 | 41 | 5 | 1:28.090 | 216.719 | 1 |
3 | 4 | 18 | 4 | 4 | 5 | 11 | 4 | 4 | 4 | 5.0 | 58 | +17.181 | 5707797 | 58 | 7 | 1:28.603 | 215.464 | 1 |
4 | 5 | 18 | 5 | 1 | 23 | 3 | 5 | 5 | 5 | 4.0 | 58 | +18.014 | 5708630 | 43 | 1 | 1:27.418 | 218.385 | 1 |
results.dtypes
resultId             int64
raceId               int64
driverId             int64
constructorId        int64
number              object
grid                 int64
position            object
positionText        object
positionOrder        int64
points             float64
laps                 int64
time                object
milliseconds        object
fastestLap          object
rank                object
fastestLapTime      object
fastestLapSpeed     object
statusId             int64
dtype: object
driver_standings.head()
| | driverStandingsId | raceId | driverId | points | position | positionText | wins |
|---|---|---|---|---|---|---|---|
0 | 1 | 18 | 1 | 10.0 | 1 | 1 | 1 |
1 | 2 | 18 | 2 | 8.0 | 2 | 2 | 0 |
2 | 3 | 18 | 3 | 6.0 | 3 | 3 | 0 |
3 | 4 | 18 | 4 | 5.0 | 4 | 4 | 0 |
4 | 5 | 18 | 5 | 4.0 | 5 | 5 | 0 |
driver_standings.dtypes
driverStandingsId      int64
raceId                 int64
driverId               int64
points               float64
position               int64
positionText          object
wins                   int64
dtype: object
constructor_standings.head()
| | constructorStandingsId | raceId | constructorId | points | position | positionText | wins |
|---|---|---|---|---|---|---|---|
0 | 1 | 18 | 1 | 14.0 | 1 | 1 | 1 |
1 | 2 | 18 | 2 | 8.0 | 3 | 3 | 0 |
2 | 3 | 18 | 3 | 9.0 | 2 | 2 | 0 |
3 | 4 | 18 | 4 | 5.0 | 4 | 4 | 0 |
4 | 5 | 18 | 5 | 2.0 | 5 | 5 | 0 |
constructor_standings.dtypes
constructorStandingsId      int64
raceId                      int64
constructorId               int64
points                    float64
position                    int64
positionText               object
wins                        int64
dtype: object
Dropping the columns we don't need¶
# We only need a few columns, which we're getting here
results = results[["raceId", "driverId", "constructorId", "grid", "position"]]
results.head()
| | raceId | driverId | constructorId | grid | position |
|---|---|---|---|---|---|
0 | 18 | 1 | 1 | 1 | 1 |
1 | 18 | 2 | 2 | 5 | 2 |
2 | 18 | 3 | 3 | 7 | 3 |
3 | 18 | 4 | 4 | 11 | 4 |
4 | 18 | 5 | 1 | 3 | 5 |
driver_standings = driver_standings[["raceId", "driverId", "position"]]
# Rename the "position" column do avoid conflict with the "position" column from results.csv
driver_standings = driver_standings.rename(columns={"position": "driverStanding"})
# Use current driver standings for the next race
driver_standings["raceId"] += 1
driver_standings.head()
| | raceId | driverId | driverStanding |
|---|---|---|---|
0 | 19 | 1 | 1 |
1 | 19 | 2 | 2 |
2 | 19 | 3 | 3 |
3 | 19 | 4 | 4 |
4 | 19 | 5 | 5 |
# Again, picking the columns we need and renaming "position"
constructor_standings = constructor_standings[["raceId", "constructorId", "position"]]
constructor_standings = constructor_standings.rename(columns={"position": "constructorStanding"})
# Use current constructor standings for the next race
constructor_standings["raceId"] += 1
constructor_standings.head()
| | raceId | constructorId | constructorStanding |
|---|---|---|---|
0 | 19 | 1 | 1 |
1 | 19 | 2 | 3 |
2 | 19 | 3 | 2 |
3 | 19 | 4 | 4 |
4 | 19 | 5 | 5 |
Joining the data¶
# Joining results with driver standings. This will add the "driverStanding" column to our results
results_driver_standings = pd.merge(results, driver_standings, on=["raceId", "driverId"], how="inner")
results_driver_standings.head()
| | raceId | driverId | constructorId | grid | position | driverStanding |
|---|---|---|---|---|---|---|
0 | 18 | 1 | 1 | 1 | 1 | 5 |
1 | 18 | 2 | 2 | 5 | 2 | 13 |
2 | 18 | 3 | 3 | 7 | 3 | 7 |
3 | 18 | 4 | 4 | 11 | 4 | 9 |
4 | 18 | 5 | 1 | 3 | 5 | 12 |
# Now we join the constructor standings and we end up with everything we need in one place
joined_data = pd.merge(results_driver_standings, constructor_standings, on=["raceId", "constructorId"], how="inner")
joined_data.head()
| | raceId | driverId | constructorId | grid | position | driverStanding | constructorStanding |
|---|---|---|---|---|---|---|---|
0 | 18 | 1 | 1 | 1 | 1 | 5 | 3 |
1 | 18 | 5 | 1 | 3 | 5 | 12 | 3 |
2 | 18 | 2 | 2 | 5 | 2 | 13 | 6 |
3 | 18 | 9 | 2 | 2 | \N | 14 | 6 |
4 | 18 | 3 | 3 | 7 | 3 | 7 | 7 |
Sense checking the data¶
Things look good so far, but I've got no idea whether the numbers are actually correct or whether I've messed something up. To verify, I'm going to print out the results for the last couple of races of 2022 and compare them with the F1 results on Wikipedia.
joined_data.sort_values(by='raceId', ascending=False).head(60)
| | raceId | driverId | constructorId | grid | position | driverStanding | constructorStanding |
|---|---|---|---|---|---|---|---|
22157 | 1096 | 825 | 210 | 16 | 17 | 13 | 8 |
22147 | 1096 | 4 | 214 | 10 | \N | 9 | 4 |
22138 | 1096 | 830 | 9 | 1 | 1 | 1 | 1 |
22139 | 1096 | 815 | 9 | 2 | 3 | 3 | 1 |
22140 | 1096 | 844 | 6 | 3 | 2 | 2 | 2 |
22141 | 1096 | 832 | 6 | 4 | 4 | 6 | 2 |
22142 | 1096 | 847 | 131 | 6 | 5 | 4 | 3 |
22144 | 1096 | 846 | 1 | 7 | 6 | 7 | 5 |
22145 | 1096 | 817 | 1 | 13 | 9 | 12 | 5 |
22146 | 1096 | 839 | 214 | 8 | 7 | 8 | 4 |
22143 | 1096 | 1 | 131 | 5 | 18 | 5 | 3 |
22148 | 1096 | 840 | 117 | 14 | 8 | 15 | 7 |
22153 | 1096 | 822 | 51 | 18 | 15 | 10 | 6 |
22149 | 1096 | 20 | 117 | 9 | 10 | 11 | 7 |
22155 | 1096 | 849 | 3 | 20 | 19 | 20 | 10 |
22154 | 1096 | 848 | 3 | 19 | 13 | 19 | 10 |
22156 | 1096 | 854 | 210 | 12 | 16 | 16 | 8 |
22152 | 1096 | 855 | 51 | 15 | 12 | 18 | 6 |
22151 | 1096 | 842 | 213 | 17 | 14 | 14 | 9 |
22150 | 1096 | 852 | 213 | 11 | 11 | 17 | 9 |
22127 | 1095 | 855 | 51 | 13 | 12 | 18 | 6 |
22118 | 1095 | 847 | 131 | 1 | 1 | 4 | 3 |
22119 | 1095 | 1 | 131 | 2 | 2 | 5 | 3 |
22120 | 1095 | 832 | 6 | 7 | 3 | 6 | 2 |
22121 | 1095 | 844 | 6 | 5 | 4 | 3 | 2 |
22122 | 1095 | 4 | 214 | 17 | 5 | 9 | 4 |
22123 | 1095 | 839 | 214 | 16 | 8 | 8 | 4 |
22124 | 1095 | 830 | 9 | 3 | 6 | 1 | 1 |
22125 | 1095 | 815 | 9 | 4 | 7 | 2 | 1 |
22126 | 1095 | 822 | 51 | 14 | 9 | 10 | 6 |
22133 | 1095 | 852 | 213 | 0 | 17 | 17 | 9 |
22128 | 1095 | 840 | 117 | 15 | 10 | 15 | 7 |
22134 | 1095 | 848 | 3 | 19 | 15 | 19 | 10 |
22129 | 1095 | 20 | 117 | 9 | 11 | 11 | 7 |
22136 | 1095 | 846 | 1 | 6 | \N | 7 | 5 |
22135 | 1095 | 849 | 3 | 18 | 16 | 20 | 10 |
22137 | 1095 | 817 | 1 | 11 | \N | 12 | 5 |
22132 | 1095 | 842 | 213 | 10 | 14 | 14 | 9 |
22131 | 1095 | 825 | 210 | 8 | \N | 13 | 8 |
22130 | 1095 | 854 | 210 | 12 | 13 | 16 | 8 |
22107 | 1094 | 4 | 214 | 9 | 19 | 9 | 4 |
22098 | 1094 | 830 | 9 | 1 | 1 | 1 | 1 |
22099 | 1094 | 815 | 9 | 4 | 3 | 3 | 1 |
22100 | 1094 | 1 | 131 | 3 | 2 | 6 | 3 |
22101 | 1094 | 847 | 131 | 2 | 4 | 4 | 3 |
22102 | 1094 | 832 | 6 | 5 | 5 | 5 | 2 |
22103 | 1094 | 844 | 6 | 7 | 6 | 2 | 2 |
22104 | 1094 | 817 | 1 | 11 | 7 | 12 | 5 |
22105 | 1094 | 846 | 1 | 8 | 9 | 7 | 5 |
22106 | 1094 | 839 | 214 | 10 | 8 | 8 | 4 |
22113 | 1094 | 849 | 3 | 18 | 18 | 20 | 10 |
22108 | 1094 | 822 | 51 | 6 | 10 | 10 | 6 |
22114 | 1094 | 20 | 117 | 16 | 14 | 11 | 7 |
22109 | 1094 | 855 | 51 | 12 | 13 | 18 | 6 |
22116 | 1094 | 854 | 210 | 15 | 16 | 16 | 8 |
22115 | 1094 | 840 | 117 | 20 | 15 | 15 | 7 |
22117 | 1094 | 825 | 210 | 19 | 17 | 13 | 8 |
22112 | 1094 | 848 | 3 | 17 | 12 | 19 | 10 |
22111 | 1094 | 852 | 213 | 13 | \N | 17 | 9 |
22110 | 1094 | 842 | 213 | 14 | 11 | 14 | 9 |
I picked George Russell as my point of reference. Why him, you ask? Maybe because I walked past him on the street once, which makes him the F1 driver I've come closest to. Good enough reason for me.
We can see below that in the Abu Dhabi 2022 race (1096), George Russell (847) came 5th, starting 6th on the grid. That looks right.
(https://en.wikipedia.org/wiki/2022_Abu_Dhabi_Grand_Prix)
Now, to check that the driver and constructor standings are correct, we have to look at the previous race. At the end of the São Paulo race, he was 4th in the driver standings and Mercedes was 3rd in the constructors'.
(https://en.wikipedia.org/wiki/2022_S%C3%A3o_Paulo_Grand_Prix)
Great! It all checks out. We can continue.
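If you'd rather pull those rows out programmatically than scan the big table above, a quick filter does it. This is just a small sketch using Russell's driverId (847) and the Abu Dhabi 2022 raceId (1096) from the table above:

# Sketch: inspect George Russell (driverId 847) at Abu Dhabi 2022 (raceId 1096)
joined_data[(joined_data["raceId"] == 1096) & (joined_data["driverId"] == 847)]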
# Let's see how many examples we ended up with
len(joined_data)
22158
Quick plot¶
joined_data[["grid", "driverStanding", "constructorStanding", "position"]].hist(figsize=(12, 8))
plt.show()
joined_data[["grid", "driverStanding", "constructorStanding", "position"]].agg(['min', 'max'])
| | grid | driverStanding | constructorStanding | position |
|---|---|---|---|---|
min | 0 | 1 | 1 | 1 |
max | 31 | 82 | 20 | \N |
Something looks fishy here.
- Why do we have grid positions over 20, when the maximum number of cars that can start a race is currently capped at 20 by the FIA?
- Why is the minimum grid position 0?
- Why is there a "\N" as the max value for position?
For the 1st point, I've got a hunch that it has to do with the really old data in our dataset. We'll try looking back only 10 years and see if that makes any difference. As for the 2nd and 3rd points, we'll deal with those when we clean the data.
# Finding out the first race ID of the year 2013
# Doing this so that we can get rid of races older than 10 years from our dataset
races = pd.read_csv('/kaggle/input/formula-1-world-championship-1950-2020/races.csv')
races.loc[races['year'] == 2013].sort_values(by="raceId", ascending=True).head()
# The answer is 880
| | raceId | year | round | circuitId | name | date | time | url | fp1_date | fp1_time | fp2_date | fp2_time | fp3_date | fp3_time | quali_date | quali_time | sprint_date | sprint_time |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
878 | 880 | 2013 | 1 | 1 | Australian Grand Prix | 2013-03-17 | 06:00:00 | http://en.wikipedia.org/wiki/2013_Australian_G... | \N | \N | \N | \N | \N | \N | \N | \N | \N | \N |
879 | 881 | 2013 | 2 | 2 | Malaysian Grand Prix | 2013-03-24 | 08:00:00 | http://en.wikipedia.org/wiki/2013_Malaysian_Gr... | \N | \N | \N | \N | \N | \N | \N | \N | \N | \N |
880 | 882 | 2013 | 3 | 17 | Chinese Grand Prix | 2013-04-14 | 07:00:00 | http://en.wikipedia.org/wiki/2013_Chinese_Gran... | \N | \N | \N | \N | \N | \N | \N | \N | \N | \N |
881 | 883 | 2013 | 4 | 3 | Bahrain Grand Prix | 2013-04-21 | 12:00:00 | http://en.wikipedia.org/wiki/2013_Bahrain_Gran... | \N | \N | \N | \N | \N | \N | \N | \N | \N | \N |
882 | 884 | 2013 | 5 | 4 | Spanish Grand Prix | 2013-05-12 | 12:00:00 | http://en.wikipedia.org/wiki/2013_Spanish_Gran... | \N | \N | \N | \N | \N | \N | \N | \N | \N | \N |
# Keeping only races with raceId greater than 880 (the 2013 season opener), i.e. roughly the last 10 years of data
joined_data = joined_data[joined_data["raceId"] > 880]
joined_data.head()
| | raceId | driverId | constructorId | grid | position | driverStanding | constructorStanding |
|---|---|---|---|---|---|---|---|
18241 | 881 | 20 | 9 | 1 | 1 | 3 | 3 |
18242 | 881 | 17 | 9 | 5 | 2 | 6 | 3 |
18243 | 881 | 1 | 131 | 4 | 3 | 5 | 4 |
18244 | 881 | 3 | 131 | 6 | 4 | 20 | 4 |
18245 | 881 | 13 | 6 | 2 | 5 | 4 | 1 |
joined_data[["grid", "driverStanding", "constructorStanding", "position"]].hist(figsize=(12, 8))
plt.show()
Things look slightly better now. We can carry on with data cleaning.
Cleaning the data¶
# Getting rid of a few more columns that we don't need anymore
dataset = joined_data[["grid", "driverStanding", "constructorStanding", "position"]]
joined_data.head()
| | raceId | driverId | constructorId | grid | position | driverStanding | constructorStanding |
|---|---|---|---|---|---|---|---|
18241 | 881 | 20 | 9 | 1 | 1 | 3 | 3 |
18242 | 881 | 17 | 9 | 5 | 2 | 6 | 3 |
18243 | 881 | 1 | 131 | 4 | 3 | 5 | 4 |
18244 | 881 | 3 | 131 | 6 | 4 | 20 | 4 |
18245 | 881 | 13 | 6 | 2 | 5 | 4 | 1 |
dataset.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3916 entries, 18241 to 22157
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   grid                 3916 non-null   int64
 1   driverStanding       3916 non-null   int64
 2   constructorStanding  3916 non-null   int64
 3   position             3916 non-null   object
dtypes: int64(3), object(1)
memory usage: 153.0+ KB
# Filter out rows where the position is not numeric (remember the "\N" before?)
dataset = dataset[dataset.position.apply(lambda x: x.isnumeric())]
# Filter out rows where grid is 0
dataset = dataset[dataset.grid.apply(lambda x: x > 0)]
# Change type for position values to integers
dataset.position = dataset.position.astype('int')
dataset
| | grid | driverStanding | constructorStanding | position |
|---|---|---|---|---|
18241 | 1 | 3 | 3 | 1 |
18242 | 5 | 6 | 3 | 2 |
18243 | 4 | 5 | 4 | 3 |
18244 | 6 | 20 | 4 | 4 |
18245 | 2 | 4 | 1 | 5 |
... | ... | ... | ... | ... |
22153 | 18 | 10 | 6 | 15 |
22154 | 19 | 19 | 10 | 13 |
22155 | 20 | 20 | 10 | 19 |
22156 | 12 | 16 | 8 | 16 |
22157 | 16 | 13 | 8 | 17 |
3258 rows × 4 columns
Looking at correlations¶
Using corr(), we can get the standard correlation coefficient between our label and each attribute. The closer the coefficient is to 1 (or -1), the stronger the correlation.
dataset.corr()["position"]
grid                   0.750109
driverStanding         0.742595
constructorStanding    0.750760
position               1.000000
Name: position, dtype: float64
We can also visualise the correlations using scatter_matrix()
scatter_matrix(dataset, figsize=(12,8))
plt.show()
Another nice way to visualise correlations is with two-dimensional histograms.
fig,ax = plt.subplots(1, 3, figsize=(12, 3))
# https://stackoverflow.com/a/20105673/3015186
max_grid = dataset.grid.max()
max_position = dataset.position.max()
max_d_position = dataset.driverStanding.max()
max_c_position = dataset.constructorStanding.max()
print(f"max_grid = {max_grid}; max_position = {max_position}; max_d_position = {max_d_position}; max_c_position = {max_c_position}")
ax[0].hist2d(dataset.grid, dataset.position, (max_grid, max_position), cmap='plasma', cmin=1)
ax[0].set_xlabel("Grid")
ax[0].set_ylabel("Final position")
ax[1].hist2d(dataset.driverStanding, dataset.position, (max_d_position, max_position), cmap='plasma', cmin=1)
ax[1].set_xlabel("Driver standing")
ax[2].hist2d(dataset.constructorStanding, dataset.position, (max_c_position, max_position), cmap='plasma', cmin=1)
ax[2].set_xlabel("Constructor standing")
plt.show()
max_grid = 22; max_position = 22; max_d_position = 24; max_c_position = 11
What we see above is that there is a linear relationship between all our features and the final position, which is good, but the data points are also very dispersed, meaning that our predictions will probably not be very accurate.
Getting ready for training¶
It's time to split our data into features and targets.
I'm not going to have a separate validation set, just to keep things simple.
One other thing I want to do is to convert the Pandas dataframes into NumPy arrays. I just find them easier to deal with once the data is ready.
# Split dataset in training X and y
x_train = dataset[['grid', 'driverStanding', 'constructorStanding']].values
y_train = dataset[['position']].values.reshape(-1) # Transforming to a 1D array
print(f'{x_train.shape}; {y_train.shape}')
print(x_train)
print(y_train)
(3258, 3); (3258,)
[[ 1  3  3]
 [ 5  6  3]
 [ 4  5  4]
 ...
 [20 20 10]
 [12 16  8]
 [16 13  8]]
[ 1  2  3 ... 19 16 17]
Next we want to implement the cost function $J(\mathbf{w},b)$: $$J(\mathbf{w},b) = \frac{1}{2m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})^2 $$ where: $$ f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = \mathbf{w} \cdot \mathbf{x}^{(i)} + b $$
def compute_cost(x, y, w, b):
    # Get number of examples
    m = x.shape[0]
    # Initialise cost
    cost = 0
    for i in range(m):
        f_wb_i = x[i].dot(w) + b
        cost += (f_wb_i - y[i]) ** 2
    return cost / (2 * m)
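As a side note, the same cost can be computed without the Python loop by letting NumPy operate on the whole training set at once. This is just a vectorised sketch that should be equivalent to the function above, not the version we'll use below:

# Vectorised sketch of the same cost function (for comparison only)
def compute_cost_vectorised(x, y, w, b):
    errors = x.dot(w) + b - y        # predictions minus targets, for all examples at once
    return np.mean(errors ** 2) / 2  # (1/2m) * sum of squared errors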
Let's try out the cost function with all weights and bias initialised to 0
w_init = [0, 0, 0]
b_init = 0
# Calculate cost for initial w and b
cost = compute_cost(x_train, y_train, w_init, b_init)
print(f"Cost = {cost}")
Cost = 55.43278084714549
Spoiler alert !!!
Now we'll run the cost function again but with the trained weights and bias (after around 25k iterations, when the algorithm converged)
# Spoiler: Here are the trained weights and bias
cost = compute_cost(x_train, y_train, [0.35008161, 0.04341071, 0.27482694], 2.0513445619234765)
print(f"Cost = {cost}")
Cost = 6.637608134053078
See how the cost went from over 55 down to around 6.
Now we want to implement the gradient descent function for multiple variables:
$$\begin{align*} \text{repeat}&\text{ until convergence:} \; \lbrace \newline\; & w_j = w_j - \alpha \frac{\partial J(\mathbf{w},b)}{\partial w_j} \; & \text{for j = 0..n-1}\newline &b\ \ = b - \alpha \frac{\partial J(\mathbf{w},b)}{\partial b} \newline \rbrace \end{align*}$$
where $n$ is the number of features, the parameters $w_j$ and $b$ are updated simultaneously, and
$$ \begin{align} \frac{\partial J(\mathbf{w},b)}{\partial w_j} &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})x_{j}^{(i)} \\ \frac{\partial J(\mathbf{w},b)}{\partial b} &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}) \end{align} $$
Here $m$ is the number of training examples in the data set, and $f_{\mathbf{w},b}(\mathbf{x}^{(i)})$ is the model's prediction, while $y^{(i)}$ is the target value.
First we implement a function to calculate the gradient, and then another to run gradient descent.
def compute_gradient(x, y, w, b):
    # Get number of examples and features
    m, n = x.shape
    # Initialise gradient of the cost w.r.t. the parameters w
    dj_dw = np.zeros((n,))
    # Initialise gradient of the cost w.r.t. the parameter b
    dj_db = 0.
    for i in range(m):
        loss = (x[i].dot(w) + b) - y[i]  # Loss is the same for both derivatives
        for j in range(n):
            dj_dw[j] += loss * x[i][j]
        dj_db += loss
    dj_dw = dj_dw / m
    dj_db = dj_db / m
    return dj_dw, dj_db
# Let's try to compute the gradient for our initial weights and bias
compute_gradient(x_train, y_train, w_init, b_init)
(array([-118.62400246, -120.44229589, -61.77931246]), -9.169429097605892)
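The nested loops above follow the formulas literally. Just for reference, the same gradients can be computed in a couple of vectorised lines; this is only a sketch and isn't used in the training below:

# Vectorised sketch of the same gradient computation (for comparison only)
def compute_gradient_vectorised(x, y, w, b):
    errors = x.dot(w) + b - y         # shape (m,)
    dj_dw = x.T.dot(errors) / len(y)  # shape (n,)
    dj_db = errors.mean()
    return dj_dw, dj_db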
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters):
    # Store the history of costs (J). This will be used when we print out the progress.
    J_history = []
    w = copy.deepcopy(w_in)
    b = b_in
    # Apply gradient descent num_iters times
    for i in range(num_iters):
        dj_dw, dj_db = gradient_function(X, y, w, b)
        # Update w and b based on the gradient, at the same time (this is important)
        w = w - alpha * dj_dw
        b = b - alpha * dj_db
        J_history.append(cost_function(X, y, w, b))
        # Print the cost 10 times over the run (or every iteration if num_iters < 10)
        if i % math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4d}: Cost {J_history[-1]:8.8f} ")
    return w, b, J_history
import copy, math
# Initial values taken from a previous descent, so as not to start from 0
initial_w = np.array([0.37888087, 0.05009108, 0.32510837])
initial_b = 2.7796157637505052
iterations = 10000
alpha = 3.5e-3
print("Running gradient descent...")
w_final, b_final, J_hist = gradient_descent(x_train, y_train, initial_w, initial_b, compute_cost, compute_gradient, alpha, iterations)
print(f"b = {b_final}; w = {w_final} ")
# Expecting something close to: b = 1.1346913859860654; w = [0.34919898 0.16862326 0.47748792]
Running gradient descent...
Iteration    0: Cost 4.98900555
Iteration 1000: Cost 4.68054531
Iteration 2000: Cost 4.62995299
Iteration 3000: Cost 4.61832210
Iteration 4000: Cost 4.61564821
Iteration 5000: Cost 4.61503350
Iteration 6000: Cost 4.61489218
Iteration 7000: Cost 4.61485969
Iteration 8000: Cost 4.61485222
Iteration 9000: Cost 4.61485050
b = 1.1346913859860654; w = [0.34919898 0.16862326 0.47748792]
Above we see that over time the cost keeps decreasing and eventually starts to converge (it decreases very slowly). It is not very obvious here because the initial weights I used are already trained. If you run this starting from all zeroes, for more iterations and maybe with a slightly larger learning rate (alpha), then you'll observe it better.
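If you'd like to see the convergence rather than read it off the printed costs, a quick plot of the cost history returned by gradient_descent works nicely (a small sketch using the J_hist variable from above):

# Plot the cost history to visualise convergence
plt.plot(J_hist)
plt.xlabel("Iteration")
plt.ylabel("Cost")
plt.show()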
Below I'm iterating over a few examples and generating some predictions, just to get an idea of how well we did.
for i in range(10):
    f = x_train[i+100].dot(w_final) + b_final
    prediction = np.round(f).astype(int)
    actual = y_train[i+100]
    print(f"Prediction: {prediction:3d}, Actual position: {actual:3d}, Accuracy: {100 - (abs(prediction - actual) / actual) * 100.0:3.0f}%")
Prediction: 5, Actual position: 5, Accuracy: 100%
Prediction: 9, Actual position: 6, Accuracy: 50%
Prediction: 11, Actual position: 15, Accuracy: 73%
Prediction: 11, Actual position: 7, Accuracy: 43%
Prediction: 8, Actual position: 10, Accuracy: 80%
Prediction: 6, Actual position: 9, Accuracy: 67%
Prediction: 12, Actual position: 13, Accuracy: 92%
Prediction: 10, Actual position: 11, Accuracy: 91%
Prediction: 11, Actual position: 12, Accuracy: 92%
Prediction: 9, Actual position: 14, Accuracy: 64%
Now let's see what the average difference is between our predictions and the actual results. In other words, by how many positions we are off, on average, when making predictions.
differences = np.zeros(x_train.shape[0])
for i in range(x_train.shape[0]):
    f = x_train[i].dot(w_final) + b_final
    prediction = np.round(f).astype(int)
    actual = y_train[i]
    differences[i] = abs(prediction - actual)
print(f"Average difference: {np.average(differences):2.0f}")
Average difference: 2
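The same number can be obtained in one vectorised line, which also shows how to generate predictions for the whole training set in one go (a sketch that should agree with the loop above):

# Mean absolute difference between rounded predictions and actual positions, in one line
print(np.abs(np.round(x_train.dot(w_final) + b_final) - y_train).mean())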
The Model¶
So what's our model? Well, here it is:
def predict(grid, driver_standing, constructor_standing):
    prediction = np.array([grid, driver_standing, constructor_standing]).dot([0.34919898, 0.16862326, 0.47748792]) + 1.1346913859860654
    return np.round(prediction).astype(int)
Let's take it for a spin then! Given a driver who starts 3rd on the grid, is 5th in the driver standings, and whose team is 2nd in the constructor standings, we predict that he's going to finish.... :drumroll:
predict(3, 5, 2)
4
And the answer is 4!
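As one more sanity check, we can feed in George Russell's numbers from the Abu Dhabi 2022 sense check above (grid 6, driver standing 4, constructor standing 3) and see whether the model lands on his actual 5th-place finish:

# George Russell, Abu Dhabi 2022: grid 6, driver standing 4, constructor standing 3
predict(6, 4, 3)  # expected to return 5, matching his actual finishing position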
Epilogue¶
Well, that's about it. I'd say we didn't do too badly. But the real test is to predict the result of a real upcoming race! The problem is that we can't do that until we've got the grid positions for that race. So check back soon for an update to this post with the real test.