import os
from pathlib import Path

# KAGGLE_KERNEL_RUN_TYPE is read to decide where the data lives; it defaults to
# an empty (falsy) string when the variable is not set.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/titanic')
else:
    path = Path('titanic')
    if not path.exists():
        # Imported lazily: these are only needed when the data must be fetched.
        import zipfile
        import kaggle
        kaggle.api.competition_download_cli(str(path))
        # The context manager closes the archive handle once extraction is done.
        with zipfile.ZipFile(f'{path}.zip') as archive:
            archive.extractall(path)

import pandas as pd, numpy as np
# Read the training set from the dataset directory resolved in `path`, so the
# same cell works for both locations that `path` can point to.
train_df = pd.read_csv(path/'train.csv')
train_df.head()

# Number of missing values in each column.
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# First row of DataFrame.mode(): one most-frequent value per column.
# It is used further down as the fill value for missing entries.
modes = train_df.mode().iloc[0]
modes

PassengerId                      1
Survived                       0.0
Pclass                         3.0
Name           Abbing, Mr. Anthony
Sex                           male
Age                           24.0
SibSp                          0.0
Parch                          0.0
Ticket                        1601
Fare                          8.05
Cabin                      B96 B98
Embarked                         S
Name: 0, dtype: object

# Impute every missing entry with the most frequent value of its column.
train_df = train_df.fillna(value=modes)

# Summary statistics for the numeric columns only.
train_df.describe(include=[np.number])

# Histogram of the raw Fare column.
train_df.Fare.hist()
print("The fare has a pretty large tail. We want to smoothen this distribution.")

The fare has a pretty large tail. We want to smoothen this distribution.

# log1p(x) computes log(1 + x) directly; it matches the transform used in
# proc_data and stays accurate for fares close to zero.
train_df["Fare"] = np.log1p(train_df["Fare"])
train_df["Fare"].hist()
print("We now have a nicer distribution that will be easier to work with.")

We now have a nicer distribution that will be easier to work with.

# Summary (count / unique / top / freq) of the object-typed columns.
train_df.describe(include = [object])

# One-hot encode the categorical columns as float indicator columns.
one_hot_columns = ["Sex", "Cabin", "Embarked"]
train_df = pd.get_dummies(train_df, columns=one_hot_columns, dtype=float)
train_df.head()

# Remove the free-text columns that are not encoded.
# NOTE(review): PassengerId stays in the frame and therefore becomes a model
# input further down; confirm that is intended.
train_df = train_df.drop(["Name", "Ticket"], axis=1)
train_df

import torch
from torch import tensor

# Target column as int64 class indices.
labels = tensor(train_df['Survived'].to_numpy(), dtype=torch.long)

# Every remaining column becomes a float32 input feature.
feature_frame = train_df.drop(columns=['Survived'])
attributes = tensor(feature_frame.to_numpy(), dtype=torch.float)

# One (features, label) pair per row.
label_atts_map = [(x, y) for x, y in zip(attributes, labels)]

from torch.utils.data import random_split, DataLoader

# 80% of the samples go to training, the remainder to validation.
train_size = int(len(label_atts_map)*0.8)
val_size = len(label_atts_map) - train_size

train, valid = random_split(label_atts_map, [train_size, val_size])

train_dl = DataLoader(train, batch_size = 64, shuffle = True)
# The validation loader iterates the held-out `valid` split so accuracy is
# measured on rows the model was not trained on; batch order is irrelevant for
# evaluation, so it is not shuffled.
valid_dl = DataLoader(valid, batch_size = 64, shuffle = False)

# Peek at one training batch to check the label shape.
xb, yb = next(iter(train_dl))
yb.shape

torch.Size([64])

import torch.nn as nn
import torch.optim as opt

# Take the input width from the feature tensor so the first layer always matches
# the number of columns produced by preprocessing.
n_features = attributes.shape[1]

# Two hidden ReLU layers followed by a 2-unit output (one logit per class).
model = nn.Sequential(
        nn.Linear(n_features,512),
        nn.ReLU(),
        nn.Linear(512,256),
        nn.ReLU(),
        nn.Linear(256, 2),
)

learning_rate = 0.1

sgd = opt.SGD(model.parameters(), lr=learning_rate)

loss_fn = nn.CrossEntropyLoss()

# Shape of the first parameter tensor (the first layer's weight matrix).
print("Parameters shape: ", next(model.parameters()).shape)

Parameters shape:  torch.Size([512, 158])

def validation():
    """Return the accuracy of ``model`` over ``valid_dl``, rounded to 4 decimals.

    Reads the module-level ``model`` and ``valid_dl``. Accuracy is the number
    of correctly classified samples divided by the number of samples seen, so
    a smaller final batch does not skew the result.

    Raises:
        ValueError: if ``valid_dl`` yields no samples.
    """
    correct = 0
    total = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for xb, yb in valid_dl:
            # argmax over dim=1 picks one class per row; without `dim` the
            # logits are flattened and a single index is returned for the
            # whole batch.
            predictions = model(xb).argmax(dim=1)
            correct += (predictions == yb).sum().item()
            total += yb.numel()
    if total == 0:
        raise ValueError("valid_dl yielded no samples")
    return round(correct / total, 4)


def train_epoch():
    """Run one optimisation pass over ``train_dl``.

    Uses the module-level ``model``, ``loss_fn`` and ``sgd``.

    Returns:
        A float NumPy array holding the loss of each batch, in iteration order.
    """
    batch_losses = []
    for features, targets in train_dl:
        sgd.zero_grad()
        batch_loss = loss_fn(model(features), targets)
        batch_loss.backward()
        sgd.step()
        batch_losses.append(batch_loss.item())
    return np.array(batch_losses, dtype=float)

# Train for 10 epochs, reporting the mean batch loss and validation accuracy.
for ep in range(10):
    epoch_losses = train_epoch()
    mean_loss = round(np.mean(epoch_losses),4)
    accuracy = round(validation(),4)
    shown_loss, shown_accuracy = round(mean_loss,3), round(accuracy,3)
    print(f"Epoch #{ep} || Mean Loss: {shown_loss} || Accuracy: {shown_accuracy}")

Epoch #0 || Mean Loss: 6.036624138778486e+18 || Accuracy: 0.6276
Epoch #1 || Mean Loss: 4.1381 || Accuracy: 0.6185
Epoch #2 || Mean Loss: 0.6586 || Accuracy: 0.6185
Epoch #3 || Mean Loss: 0.6694 || Accuracy: 0.6367
Epoch #4 || Mean Loss: 0.6694 || Accuracy: 0.6185
Epoch #5 || Mean Loss: 0.6695 || Accuracy: 0.6185
Epoch #6 || Mean Loss: 0.6613 || Accuracy: 0.6094
Epoch #7 || Mean Loss: 0.6563 || Accuracy: 0.5911
Epoch #8 || Mean Loss: 0.6651 || Accuracy: 0.6276
Epoch #9 || Mean Loss: 0.6698 || Accuracy: 0.6185

# Reload the raw training data from the dataset directory resolved in `path`,
# discarding the transformations applied to the previous frame.
train_df = pd.read_csv(path/'train.csv')
train_df

def proc_data(df):
    """Clean ``df`` in place; returns ``None``.

    Steps, in order:
      1. Missing ``Fare`` values become 0.
      2. All other missing values are filled from the module-level ``modes``.
      3. ``LogFare`` is added as ``log1p(Fare)``.
      4. ``Embarked`` and ``Sex`` are converted to pandas categoricals.
    """
    fare_filled = df['Fare'].fillna(0)
    df['Fare'] = fare_filled
    df.fillna(modes, inplace=True)
    df['LogFare'] = np.log1p(df['Fare'])
    for column in ('Embarked', 'Sex'):
        df[column] = pd.Categorical(df[column])

proc_data(train_df)

# Column groups used as model inputs, and the column to predict.
categoricals = ['Sex', 'Embarked']
continous = ['Age', 'SibSp', 'Parch', 'LogFare', 'Pclass']

dependent = 'Survived'

# Replace each categorical column with its integer category codes.
for col in categoricals:
    train_df[col] = train_df[col].cat.codes

from numpy import random
from sklearn.model_selection import train_test_split

# Fix NumPy's global random seed to help make this repeatable.
np.random.seed(42)

# Hold out a quarter of the rows for validation.
trn_df, val_df = train_test_split(train_df, test_size=0.25)

def split_target_input(df):
    """Split ``df`` into model inputs and target.

    Returns:
        A tuple ``(xs, y)``: ``xs`` is a copy of the ``categoricals`` followed
        by the ``continous`` columns, ``y`` is the ``dependent`` column.
    """
    feature_columns = [*categoricals, *continous]
    inputs = df.loc[:, feature_columns].copy()
    target = df[dependent]
    return inputs, target

# Inputs and target for the training and validation frames respectively.
trn_xs, trn_y = split_target_input(trn_df)
val_xs, val_y = split_target_input(val_df)

from sklearn.tree import DecisionTreeClassifier, plot_tree

# Fit a small tree on the training split (max_leaf_nodes caps it at 4 leaves).
tree = DecisionTreeClassifier(max_leaf_nodes = 4).fit(trn_xs, trn_y)

# Draw the fitted tree, labelling splits with the input column names.
plot_tree(tree, feature_names = trn_xs.columns, filled=True)
print("A graph of our decision tree")

A graph of our decision tree

#Our metrics function
def mean_square_error(predictions, targets):
    """Return the mean of the squared element-wise differences as a scalar."""
    residuals = predictions - targets
    squared = residuals ** 2
    return np.mean(squared).item()

# Score the single decision tree on the validation split.
mean_square_error(tree.predict(val_xs), val_y)

0.2242152466367713

# A function that defines a bootstrapped tree

def get_tree(prop=0.75):
    """Fit a decision tree on a random sample of the training rows.

    Args:
        prop: sample size as a fraction of the number of training rows.

    Returns:
        A ``DecisionTreeClassifier`` (``min_samples_leaf=5``) fitted on the
        sampled rows of the module-level ``trn_xs`` / ``trn_y``.
    """
    n_rows = len(trn_y)
    sample_size = int(n_rows * prop)
    # Random row positions drawn from range(n_rows).
    picked = random.choice(n_rows, sample_size)
    learner = DecisionTreeClassifier(min_samples_leaf=5)
    return learner.fit(trn_xs.iloc[picked], trn_y.iloc[picked])

def random_forest(xs, length=100):
    """Average the predictions of ``length`` freshly fitted trees.

    Args:
        xs: inputs to predict on.
        length: number of trees to fit via ``get_tree``.

    Returns:
        The element-wise mean of the trees' predictions for ``xs``.
    """
    per_tree = []
    for _ in np.arange(length):
        per_tree.append(get_tree().predict(xs))
    stacked = np.stack(per_tree)
    return stacked.mean(axis=0)

# Score the hand-rolled forest's averaged predictions on the validation split.
preds = random_forest(val_xs)
mean_square_error(preds, val_y)

0.13712600896860988

# using sklearn
from sklearn.ensemble import RandomForestClassifier

# 100 trees, each leaf holding at least 5 samples.
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=5).fit(trn_xs, trn_y)
rf_predictions = rf.predict(val_xs)
mean_square_error(rf_predictions, val_y)

0.18385650224215247

title = "Feature importance of our random forest model"
# One row per input column with the importance reported by the fitted forest.
importance_df = pd.DataFrame({'cols': trn_xs.columns, 'imp': rf.feature_importances_})
plt = importance_df.plot(x='cols', y='imp', kind='barh', title=title)

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	891.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	28.566970	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	13.199572	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	22.000000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	24.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	35.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	Name	Sex	Ticket	Cabin	Embarked
count	891	891	891	891	891
unique	891	2	681	147	3
top	Braund, Mr. Owen Harris	male	347082	B96 B98	S
freq	1	577	7	691	646

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
...	...	...	...	...	...	...	...	...	...	...	...	...
886	887	0	2	Montvila, Rev. Juozas	male	27.0	0	0	211536	13.0000	NaN	S
887	888	1	1	Graham, Miss. Margaret Edith	female	19.0	0	0	112053	30.0000	B42	S
888	889	0	3	Johnston, Miss. Catherine Helen "Carrie"	female	NaN	1	2	W./C. 6607	23.4500	NaN	S
889	890	1	1	Behr, Mr. Karl Howell	male	26.0	0	0	111369	30.0000	C148	C
890	891	0	3	Dooley, Mr. Patrick	male	32.0	0	0	370376	7.7500	NaN	Q

Training A Classification Model For Titanic Survivors

Neural Network

Random Forests

Decision Trees

Preprocessing our data

Training Our Decision Tree

Bagging + Random Forests

Gradient Boosting Machine

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare	Sex_female	Sex_male	Cabin_A10	...	Cabin_F G73	Cabin_F2	Cabin_F33	Cabin_F38	Cabin_F4	Cabin_G6	Cabin_T	Embarked_C	Embarked_Q	Embarked_S
0	1	0	3	22.0	1	0	2.110213	0.0	1.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0
1	2	1	1	38.0	1	0	4.280593	1.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0
2	3	1	3	26.0	0	0	2.188856	1.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0
3	4	1	1	35.0	1	0	3.990834	1.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0
4	5	0	3	35.0	0	0	2.202765	0.0	1.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
886	887	0	2	27.0	0	0	2.639057	0.0	1.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0
887	888	1	1	19.0	0	0	3.433987	1.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0
888	889	0	3	24.0	1	2	3.196630	1.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0
889	890	1	1	26.0	0	0	3.433987	0.0	1.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0
890	891	0	3	32.0	0	0	2.169054	0.0	1.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0