Predict cruise ship crew size based on ship characteristics.
Author: Niclas Kjäll-Ohlsson (niclasko@gmail.com)
# Data wrangling
import pandas as pd
import numpy as np
# Modeling
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import make_scorer
# Metrics
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr
# Plotting
import matplotlib.pyplot as plt
import plotly.express as px
data = pd.read_csv("https://raw.githubusercontent.com/niclasko/niclasko.github.io/master/data/cruise_ship_info.csv")
categorical = ["Ship_name", "Cruise_line"]
numerical = ["Age", "Tonnage", "passengers", "length", "cabins", "passenger_density", "crew"]
target_variable = "crew"
numerical_independent_variables = [i for i in numerical if i!=target_variable]
data.head(5)
|  | Ship_name | Cruise_line | Age | Tonnage | passengers | length | cabins | passenger_density | crew |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Journey | Azamara | 6 | 30.277 | 6.94 | 5.94 | 3.55 | 42.64 | 3.55 |
| 1 | Quest | Azamara | 6 | 30.277 | 6.94 | 5.94 | 3.55 | 42.64 | 3.55 |
| 2 | Celebration | Carnival | 26 | 47.262 | 14.86 | 7.22 | 7.43 | 31.80 | 6.70 |
| 3 | Conquest | Carnival | 11 | 110.000 | 29.74 | 9.53 | 14.88 | 36.99 | 19.10 |
| 4 | Destiny | Carnival | 17 | 101.353 | 26.42 | 8.92 | 13.21 | 38.36 | 10.00 |
Summary statistics over the data
data.describe()
|  | Age | Tonnage | passengers | length | cabins | passenger_density | crew |
|---|---|---|---|---|---|---|---|
| count | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 |
| mean | 15.689873 | 71.284671 | 18.457405 | 8.130633 | 8.830000 | 39.900949 | 7.794177 |
| std | 7.615691 | 37.229540 | 9.677095 | 1.793474 | 4.471417 | 8.639217 | 3.503487 |
| min | 4.000000 | 2.329000 | 0.660000 | 2.790000 | 0.330000 | 17.700000 | 0.590000 |
| 25% | 10.000000 | 46.013000 | 12.535000 | 7.100000 | 6.132500 | 34.570000 | 5.480000 |
| 50% | 14.000000 | 71.899000 | 19.500000 | 8.555000 | 9.570000 | 39.085000 | 8.150000 |
| 75% | 20.000000 | 90.772500 | 24.845000 | 9.510000 | 10.885000 | 44.185000 | 9.990000 |
| max | 48.000000 | 220.000000 | 54.000000 | 11.820000 | 27.000000 | 71.430000 | 21.000000 |
Here we create one column per value of Cruise_line, where 1 indicates that an observation has the corresponding Cruise_line value and 0 that it does not. One possible use of this one-hot encoding is to model interaction terms between the numerical variables and Cruise_line, e.g. Orient*Tonnage, which captures the distribution of a numerical variable conditioned on Cruise_line. A small sketch of such an interaction column follows the preview table below.
one_hots = pd.get_dummies(data["Cruise_line"])
one_hot_variables = one_hots.columns.values.tolist()
data = data.join(one_hots)
data.head(5)
|  | Ship_name | Cruise_line | Age | Tonnage | passengers | length | cabins | passenger_density | crew | Azamara | ... | Oceania | Orient | P&O | Princess | Regent_Seven_Seas | Royal_Caribbean | Seabourn | Silversea | Star | Windstar |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Journey | Azamara | 6 | 30.277 | 6.94 | 5.94 | 3.55 | 42.64 | 3.55 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | Quest | Azamara | 6 | 30.277 | 6.94 | 5.94 | 3.55 | 42.64 | 3.55 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | Celebration | Carnival | 26 | 47.262 | 14.86 | 7.22 | 7.43 | 31.80 | 6.70 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | Conquest | Carnival | 11 | 110.000 | 29.74 | 9.53 | 14.88 | 36.99 | 19.10 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | Destiny | Carnival | 17 | 101.353 | 26.42 | 8.92 | 13.21 | 38.36 | 10.00 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 29 columns
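As a small illustration of the interaction idea mentioned above (the full set of interaction columns is added in the modeling section), one such column can be built directly: the product of the one-hot flag and the numerical variable equals Tonnage for Orient ships and 0 otherwise. This snippet is a sketch and not part of the original notebook.
example_interaction = data["Orient"] * data["Tonnage"]  # non-zero only for Orient ships
print(example_interaction[data["Cruise_line"] == "Orient"].head())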
We can see that the target variable crew size is fairly normally distributed around a mode of 9.2 and a mean of about 7.8, with a slight skew towards lower values. There are two outliers at around 20, both well above the 95th percentile (~12.5).
crew_hist = plt.hist(data.crew, bins=20, color="blue", edgecolor="k", alpha=0.65)
crew_quantiles = data.crew.quantile([.05,.95]).values
plt.axvline(crew_quantiles[0], color="y", linestyle="dashed", linewidth=2)
plt.text(crew_quantiles[0], 15, "5% percentile", rotation=90)
plt.axvline(crew_quantiles[1], color="y", linestyle="dashed", linewidth=2)
plt.text(crew_quantiles[1], 15, "95% percentile", rotation=90)
plt.axvline(data.crew.mode().values[0], color='r', linestyle="dashed", linewidth=2)
plt.text(data.crew.mode().values[0], 15, "mode", rotation=90)
plt.axvline(data.crew.mean(), color="black", linestyle="dashed", linewidth=2)
plt.text(data.crew.mean(), 15, "mean", rotation=90)
plt.xlabel("Crew Size")
plt.ylabel("Frequency")
plt.title("Ship crew size distribution")
plt.show()
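To back up the reading of the histogram, the skew and the 95th percentile can be checked numerically; a quick sketch, not part of the original analysis:
print(f"Skewness: {data.crew.skew():.2f}")                # sign indicates the direction of the skew (negative = longer left tail)
print(f"95th percentile: {data.crew.quantile(.95):.2f}")  # the two ships around 20 lie well above this value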
Looking at the Pearson correlation between the numerical variables (Age, Tonnage, passengers, length, cabins, passenger_density and crew), we can see a strong positive correlation between all variables except Age and passenger_density. Age shows a medium negative correlation with crew size, and passenger_density only a weak, negative one. All in all we have an indication of linearity between the variables, with passenger_density as the exception. Variables with a strong correlation to the target are good candidate predictors for a regression model and will likely predict crew size well.
Pearson correlation is defined as:
$\rho_{X,Y} = \frac{\text{cov}(X,Y)}{\sigma_X \sigma_Y}$, where
$\text{cov}(X,Y) = \frac{1}{N} \sum_{i=1}^{N} (x_i - \bar{x})(y_i - \bar{y})$,
$\bar{x} = \frac{1}{N} \sum_{i=1}^N x_i$ and
$\sigma_X = \sqrt{\frac{1}{N} \sum_{i=1}^N (x_i - \bar{x})^2}$
How to interpret Pearson correlation (source: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient):
The correlation coefficient ranges from −1 to 1. A value of 1 implies that a linear equation describes the relationship between X and Y perfectly, with all data points lying on a line for which Y increases as X increases. A value of −1 implies that all data points lie on a line for which Y decreases as X increases. A value of 0 implies that there is no linear correlation between the variables.
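As a sanity check on the definition, the correlation between, for example, cabins and crew can be computed directly with NumPy and compared against scipy.stats.pearsonr; a minimal sketch using the population (1/N) forms throughout:
x = data["cabins"].values
y = data["crew"].values
cov_xy = np.mean((x - x.mean()) * (y - y.mean()))  # population covariance (1/N)
rho_manual = cov_xy / (x.std() * y.std())          # np.std also divides by N by default
print(rho_manual, pearsonr(x, y)[0])               # the two values should agree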
corr = data[numerical].corr()
corr.style.background_gradient(cmap="BrBG")
|  | Age | Tonnage | passengers | length | cabins | passenger_density | crew |
|---|---|---|---|---|---|---|---|
| Age | 1.000000 | -0.606646 | -0.515542 | -0.532286 | -0.510019 | -0.278830 | -0.530657 |
| Tonnage | -0.606646 | 1.000000 | 0.945061 | 0.922368 | 0.948764 | -0.040846 | 0.927569 |
| passengers | -0.515542 | 0.945061 | 1.000000 | 0.883535 | 0.976341 | -0.294867 | 0.915234 |
| length | -0.532286 | 0.922368 | 0.883535 | 1.000000 | 0.889798 | -0.090488 | 0.895857 |
| cabins | -0.510019 | 0.948764 | 0.976341 | 0.889798 | 1.000000 | -0.253181 | 0.950823 |
| passenger_density | -0.278830 | -0.040846 | -0.294867 | -0.090488 | -0.253181 | 1.000000 | -0.155509 |
| crew | -0.530657 | 0.927569 | 0.915234 | 0.895857 | 0.950823 | -0.155509 | 1.000000 |
passenger_density is the variable with the lowest correlation to the target variable crew size, as well as to the other variables. Hence, it is the noisiest variable and a candidate for removal in the modeling below.
fig = px.scatter_matrix(data,
dimensions=numerical,
color="crew",
height=800
)
fig.show()
Next, ranking the variables by their correlation with the target variable crew size, we can see below that cabins, Tonnage, passengers and length have a strong positive correlation with crew size, while passenger_density has a weak negative correlation and Age a medium negative one.
print("Independent variable correlation with target variable crew size")
pd.DataFrame(corr.loc[corr.index.values != "crew", "crew"].sort_values(ascending=False))\
.rename(columns={"crew": "Crew size correlation"})
Independent variable correlation with target variable crew size
|  | Crew size correlation |
|---|---|
| cabins | 0.950823 |
| Tonnage | 0.927569 |
| passengers | 0.915234 |
| length | 0.895857 |
| passenger_density | -0.155509 |
| Age | -0.530657 |
Next we perform a singular value decomposition (SVD) matrix factorisation in order to cluster the data in 2 dimensions. We project the numerical variables (without the target variable crew) onto the two components (variable weightings) that explain most of the variance in the dataset.
The SVD projections are then clustered with the K-means algorithm. The plot below shows that data points that are close in the latent 2D SVD space are also similar in data space (hover over points to see the actual data).
svd = TruncatedSVD(n_components=2, n_iter=200, random_state=42)
svd_projections = svd.fit_transform(
data[numerical_independent_variables])
data_clustered = data[categorical+numerical].join(
pd.DataFrame(svd_projections, columns=["svd_X", "svd_Y"]))
data_clustered.loc[:, "cluster"] =\
KMeans(n_clusters=4, random_state=0).fit(data_clustered[["svd_X", "svd_Y"]]).labels_
We can see that the two SVD components (svd_X and svd_Y) explain almost all of the variance in the independent variables over the dataset (sum of explained variance 0.954006).
pd.DataFrame(np.array(svd.explained_variance_ratio_.tolist() + [np.sum(svd.explained_variance_ratio_)]),
index=["SVD Component 1 (svd_X)", "SVD Component 2 (svd_Y)", "Sum variance"],
columns=["Explained variance"])
|  | Explained variance |
|---|---|
| SVD Component 1 (svd_X) | 0.714321 |
| SVD Component 2 (svd_Y) | 0.239685 |
| Sum variance | 0.954006 |
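As a rough check on how much information the two components retain, the projection can be inverted with TruncatedSVD's inverse_transform and compared to the original matrix. This relative reconstruction error check is a sketch and not part of the original analysis:
X_num = data[numerical_independent_variables].values
X_reconstructed = svd.inverse_transform(svd_projections)
relative_error = np.linalg.norm(X_num - X_reconstructed) / np.linalg.norm(X_num)
print(f"Relative reconstruction error: {relative_error:.4f}")  # small values mean the 2 components retain most of the structure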
Below we can see the variable weights for the 2 SVD components (svd_X and svd_Y) that the data is projected on.
svd_variable_weights = pd.DataFrame(svd.components_.T,
index=numerical_independent_variables,
columns=["svd_X weights", "svd_Y weights"]
)
svd_variable_weights.style.background_gradient(cmap="BrBG")
|  | svd_X weights | svd_Y weights |
|---|---|---|
| Age | 0.139450 | -0.450466 |
| Tonnage | 0.866058 | 0.405934 |
| passengers | 0.221644 | 0.120804 |
| length | 0.088451 | -0.061696 |
| cabins | 0.105571 | 0.048212 |
| passenger_density | 0.402993 | -0.782032 |
Data points that are similar (close) in the latent 2D SVD space are also close in data space. Hover over the data points in the plot below to see.
fig = px.scatter(data_clustered,
x="svd_X", y="svd_Y",
hover_data=categorical+numerical,
size="crew",
color="cluster",
title="Numerical independent variables projected and clustered (K-Means) in SVD 2D space")
fig.show()
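The proximity claim can also be quantified by correlating pairwise distances in the original variable space with distances in the 2D SVD space; a rough sketch, not in the original notebook:
from scipy.spatial.distance import pdist
distances_data = pdist(data[numerical_independent_variables].values)
distances_svd = pdist(data_clustered[["svd_X", "svd_Y"]].values)
print(np.corrcoef(distances_data, distances_svd)[0, 1])  # a value near 1 means pairwise distances are largely preserved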
Here we build a linear model with L2 regularisation (Ridge regression) to predict crew size from the independent variables. The following steps are done: interaction terms between the numerical variables and the Cruise_line one-hot variables are added, the data is split into train and test sets, the L2 penalty (alpha) is tuned with a grid search using 5-fold cross validation on the training data, and the best model is then fitted and evaluated on the held-out test set.
The objective of Ridge regression is to minimise the sum of squared errors between actual and predicted values while at the same time minimising the sum of squared regression coefficients. By doing so, Ridge regression strikes a balance between under- and over-fitting: it seeks the best tradeoff between bias (underfitting) and variance (overfitting). The Ridge regression objective function is:
$$\text{objective}=\min_{\beta}\left(\sum_{i=1}^{n}\Big(y_i-\sum_{j=1}^{p}x_{i,j}\beta_{j}\Big)^2+\lambda\sum_{j=1}^{p}\beta_{j}^2\right)$$
where $n$ is the number of observations in the data set, $p$ the number of model parameters (variables), $\beta$ the model coefficients and $\lambda$ the regularisation penalty term. In scikit-learn's Ridge, $\lambda$ is the alpha parameter.
Overfitting in a linear model can be caused by coefficients attaining large magnitudes (i.e. high variance), especially in models with many variables. The Ridge objective penalises large coefficients in both the positive and negative direction (hence the square), while also minimising the model error. The objective function thus forces the learning algorithm to learn the simplest model with the best performance (minimum error), and simpler models are more likely to generalise well to unseen data.
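For intuition, the Ridge solution (for centred data, without an intercept) has the closed form $\beta = (X^TX + \lambda I)^{-1}X^Ty$. The following sketch uses hypothetical toy data, not the cruise data, to show how increasing alpha shrinks the coefficients toward zero:
rng = np.random.RandomState(0)
X_toy = rng.randn(50, 3)                                   # hypothetical design matrix
y_toy = X_toy @ np.array([3.0, -2.0, 0.5]) + rng.randn(50) * 0.1
for alpha in [0.0, 1.0, 100.0]:
    beta = np.linalg.solve(X_toy.T @ X_toy + alpha * np.eye(3), X_toy.T @ y_toy)
    print(alpha, np.round(beta, 3))                        # coefficients shrink toward zero as alpha grows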
# Add interaction terms for numerical variables and Cruise_line one-hot-variables
for interaction in [(i,j) for i in numerical_independent_variables for j in one_hot_variables]:
data.loc[:, interaction[0] + "*" + interaction[1]] = data[interaction[0]]*data[interaction[1]]
interaction_variables = [i for i in data.columns.values if "*" in i]
X_train, X_test, Y_train, Y_test = train_test_split(
data.loc[:, numerical_independent_variables + interaction_variables],
data.loc[:, target_variable],
test_size=.4, random_state=42
)
Perform a grid search for the L2 regularization parameter (alpha) in Ridge regression. For each parameter value, perform 5-fold cross validation over the training data and record the mean RMSE, Pearson and $R^2$ scores. Select the best alpha as the value where the absolute difference between the mean train and validation Pearson scores is smallest. We thereby seek a good balance between under- and over-fitting; alpha is the hyper-parameter tuned below to improve the generalizability of the model.
Pearson correlation is defined above.
$R^2$ is defined as $R^2 = 1-\frac{\sum_{i=1}^{n}(y_i-f_i)^2}{\sum_{i=1}^{n}(y_i-\bar{y})^2}$, where $f_i$ is the model prediction for observation $i$, i.e. one minus the ratio of the residual sum of squares to the total sum of squares of the target variable. A model that always predicts the mean value of $y$ has $R^2=0$, whereas a model that explains all the variance in $y$ has $R^2=1$, in other words a perfect fit.
RMSE is defined as $\sqrt{\frac{1}{N}\sum_{i=1}^{N}(f_i-y_i)^2}$.
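Both metrics can be checked against their scikit-learn counterparts; a short sketch on hypothetical arrays:
y_true = np.array([3.0, 5.0, 7.0, 9.0])   # hypothetical values, for illustration only
y_pred = np.array([2.5, 5.5, 7.0, 8.0])
ss_res = np.sum((y_true - y_pred) ** 2)
ss_tot = np.sum((y_true - y_true.mean()) ** 2)
print(1 - ss_res / ss_tot, r2_score(y_true, y_pred))                      # manual R^2 vs. sklearn
print(np.sqrt(np.mean((y_true - y_pred) ** 2)),
      np.sqrt(mean_squared_error(y_true, y_pred)))                        # manual RMSE vs. sklearn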
def pearson(x, y):
return pearsonr(x, y)[0]
def rmse(x, y):
return np.sqrt(mean_squared_error(x, y))
scorer = {
    "Pearson": make_scorer(pearson, greater_is_better=True),
    # greater_is_better=True keeps the reported RMSE un-negated; the scorer is only
    # used to report cross-validation means below, not for automated model selection.
    "RMSE": make_scorer(rmse, greater_is_better=True),
    "R2": make_scorer(r2_score, greater_is_better=True)
}
metrics = []
for l2 in np.arange(.01, 2, step=.01):
clf = Ridge(alpha=l2, max_iter=1000000)
clf.fit(X_train, Y_train)
cv_results = cross_validate(clf, X_train, Y_train, cv=5,
return_train_score=True, scoring=scorer)
metrics.append([
l2,
np.mean(cv_results["train_RMSE"]),
np.mean(cv_results["test_RMSE"]),
np.mean(cv_results["train_Pearson"]),
np.mean(cv_results["test_Pearson"]),
np.mean(cv_results["train_R2"]),
np.mean(cv_results["test_R2"])
])
metrics = pd.DataFrame(metrics,
columns=["L2 regularization strength",
"Train RMSE", "Validation RMSE",
"Train Pearson", "Validation Pearson",
"Train R2", "Validation R2"])
metrics.loc[:, "Abs diff Train/Validation RMSE"] = np.abs(metrics["Train RMSE"]-metrics["Validation RMSE"])
metrics.loc[:, "Abs diff Train/Validation Pearson"] = np.abs(metrics["Train Pearson"]-metrics["Validation Pearson"])
metrics.loc[:, "Abs diff Train/Validation R2"] = np.abs(metrics["Train R2"]-metrics["Validation R2"])
best_l2 = metrics.loc[metrics["Abs diff Train/Validation Pearson"].idxmin(),
                      "L2 regularization strength"]
clf = Ridge(alpha=best_l2, max_iter=1000000)
clf.fit(X_train, Y_train)
train_preds = clf.predict(X_train)
test_preds = clf.predict(X_test)
variable_weights = pd.DataFrame({"Variable": X_train.columns.values,
"Regression coefficients": clf.coef_}).sort_values(
by="Regression coefficients", ascending=False)
print("Top 5 positive model coefficients")
print(variable_weights.head(5))
print()
print("Bottom 5 negative model coefficients")
print(variable_weights.tail(5))
Top 5 positive model coefficients
                        Variable  Regression coefficients
87               cabins*Carnival                 0.544874
3                         length                 0.468848
67               length*Carnival                 0.447826
4                         cabins                 0.338402
75              length*Norwegian                 0.245306

Bottom 5 negative model coefficients
                        Variable  Regression coefficients
115  passenger_density*Norwegian                -0.148219
95              cabins*Norwegian                -0.156479
107   passenger_density*Carnival                -0.203864
2                     passengers                -0.204591
55          passengers*Norwegian                -0.461542
Above we see the top 5 positive and bottom 5 negative model coefficients. A coefficient (weight) can be interpreted as the change in the predicted crew size for a one-unit increase in the corresponding variable, holding the other variables fixed.
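This interpretation can be checked directly on the fitted model: increasing a single input column by one unit and re-predicting recovers the corresponding coefficient. Note that for a variable that also appears in interaction terms, a real one-unit change for a given cruise line would change those interaction columns too; the sketch below (not in the original notebook) changes only the main-effect column to isolate its coefficient:
row = X_train.iloc[[0]].copy()
bumped = row.copy()
bumped["length"] = bumped["length"] + 1   # only the main-effect column is changed; interaction columns are left as-is
delta = clf.predict(bumped)[0] - clf.predict(row)[0]
print(delta, clf.coef_[X_train.columns.get_loc("length")])  # the difference recovers the length coefficient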
fig, (ax1, ax2, ax3) = plt.subplots(3, sharex=True)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
fig.suptitle("Effects of L2 regularization on RMSE, Pearson and R^2 scores")
xticks = metrics["L2 regularization strength"]
ax1.plot(xticks, metrics["Train Pearson"], label="Train Pearson")
ax1.plot(xticks, metrics["Validation Pearson"], label="Validation Pearson")
ax1.axvline(best_l2, color="g", linestyle="dashed", linewidth=2)
ax1.set_ylabel("Pearson")
ax1.legend()
ax2.plot(xticks, metrics["Train RMSE"], label="Train RMSE")
ax2.plot(xticks, metrics["Validation RMSE"], label="Validation RMSE")
ax2.axvline(best_l2, color="g", linestyle="dashed", linewidth=2)
ax2.set_ylabel("RMSE")
ax2.legend()
ax3.plot(xticks, metrics["Train R2"], label="Train R2")
ax3.plot(xticks, metrics["Validation R2"], label="Validation R2")
ax3.axvline(best_l2, color="g", linestyle="dashed", linewidth=2)
ax3.set_xlabel("L2 regularization strength")
ax3.set_ylabel("R2")
ax3.legend()
plt.show()
The plots above show that with a stronger L2 regularization weight the Ridge regression model performs better on the validation data, i.e. it generalizes better, whereas a smaller L2 weight causes the model to overfit the training data and generalize poorly to the validation data. Within the range of alphas searched, the train and validation curves do not fully converge on a single best bias-variance tradeoff, but the plots show that a stronger L2 regularization weight improves the generalizability of the model on the validation data (5-fold cross validation).
# Function to draw best fit line for actuals vs. predicted
# Source: https://stackoverflow.com/questions/22104256/does-matplotlib-have-a-function-for-drawing-diagonal-lines-in-axis-coordinates
def add_identity(axes, *line_args, **line_kwargs):
identity, = axes.plot([], [], *line_args, **line_kwargs)
def callback(axes):
low_x, high_x = axes.get_xlim()
low_y, high_y = axes.get_ylim()
low = max(low_x, low_y)
high = min(high_x, high_y)
identity.set_data([low, high], [low, high])
callback(axes)
axes.callbacks.connect('xlim_changed', callback)
axes.callbacks.connect('ylim_changed', callback)
return axes
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle("Scatter plot of actuals vs. predicted")
ax1.scatter(Y_train, train_preds)
ax1.set_ylabel("Predicted crew size")
ax1.set_xlabel("Actual crew size")
ax1.set_title(f"Train Pearson: {np.round(pearsonr(Y_train, train_preds)[0],2)}")
add_identity(ax1, color='black', ls='--')
ax2.scatter(Y_test, test_preds)
ax2.set_ylabel("Predicted crew size")
ax2.set_xlabel("Actual crew size")
ax2.set_title(f"Test Pearson: {np.round(pearsonr(Y_test, test_preds)[0],2)}")
add_identity(ax2, color='black', ls='--')
plt.show()
The scatter plots above show predicted vs. actual crew size and the Pearson score for the train and test sets. We can see that the Ridge regression model has learnt a set of parameters that predicts well on both the train and the test set. A Pearson score of 1 corresponds to a perfect positive linear fit; we see Pearson scores of 0.97 and 0.95 for the train and test sets, respectively.
print(f"Train Pearson: {pearsonr(Y_train, train_preds)[0]}")
print(f"Test Pearson: {pearsonr(Y_test, test_preds)[0]}")
print()
print(f"Train R^2: {r2_score(Y_train, train_preds)}")
print(f"Test R^2: {r2_score(Y_test, test_preds)}")
print()
print(f"Train RMSE: {rmse(Y_train, train_preds)}")
print(f"Test RMSE: {rmse(Y_test, test_preds)}")
Train Pearson: 0.9747282501552433
Test Pearson: 0.9538371244913997

Train R^2: 0.9500872833008763
Test R^2: 0.8972697396110219

Train RMSE: 0.7647542707295315
Test RMSE: 1.1392709254742084
Above are the Pearson, R^2 and RMSE scores for the train and test sets. Performance is slightly worse on the test set than on the training set, but still very good: we have found a good model.