In [50]:
import numpy as np
import warnings
import random
warnings.filterwarnings("ignore")
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# model
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
# pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
In [51]:
data = pd.read_excel("onlinefoods.xlsx")
data.head(10)
Out[51]:
Age | Gender | Marital Status | Occupation | Monthly Income | Educational Qualifications | Family size | latitude | longitude | Pin code | Output | Feedback | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 20 | Female | Single | Student | No Income | Post Graduate | 4 | 12.9766 | 77.5993 | 560001 | Yes | Positive |
1 | 24 | Female | Single | Student | Below Rs.10000 | Graduate | 3 | 12.9770 | 77.5773 | 560009 | Yes | Positive |
2 | 22 | Male | Single | Student | Below Rs.10000 | Post Graduate | 3 | 12.9551 | 77.6593 | 560017 | Yes | Negative |
3 | 22 | Female | Single | Student | No Income | Graduate | 6 | 12.9473 | 77.5616 | 560019 | Yes | Positive |
4 | 22 | Male | Single | Student | Below Rs.10000 | Post Graduate | 4 | 12.9850 | 77.5533 | 560010 | Yes | Positive |
5 | 27 | Female | Married | Employee | More than 50000 | Post Graduate | 2 | 12.9299 | 77.6848 | 560103 | Yes | Positive |
6 | 22 | Male | Single | Student | No Income | Graduate | 3 | 12.9770 | 77.5773 | 560009 | Yes | Positive |
7 | 24 | Female | Single | Student | No Income | Post Graduate | 3 | 12.9828 | 77.6131 | 560042 | Yes | Positive |
8 | 23 | Female | Single | Student | No Income | Post Graduate | 2 | 12.9766 | 77.5993 | 560001 | Yes | Positive |
9 | 23 | Female | Single | Student | No Income | Post Graduate | 4 | 12.9854 | 77.7081 | 560048 | Yes | Positive |
In [52]:
data.dtypes
Out[52]:
Age int64 Gender object Marital Status object Occupation object Monthly Income object Educational Qualifications object Family size int64 latitude float64 longitude float64 Pin code int64 Output object Feedback object dtype: object
In [53]:
data.isnull().sum()
Out[53]:
Age 0 Gender 0 Marital Status 0 Occupation 0 Monthly Income 0 Educational Qualifications 0 Family size 0 latitude 0 longitude 0 Pin code 0 Output 0 Feedback 0 dtype: int64
In [54]:
plt.figure(figsize=(10, 8))
plt.title("Online Food Order Decisions Based on the Age of the Customer")
sns.countplot(x="Age", data=data, hue="Feedback", palette="Set3")
Out[54]:
<Axes: title={'center': 'Online Food Order Decisions Based on the Age of the Customer'}, xlabel='Age', ylabel='count'>
In [55]:
plt.figure(figsize=(10, 8))
plt.title("Online Food Order Decisions Based on the Size of the Family")
sns.countplot(x="Family size", data=data, hue="Feedback", palette="Set3_r")
Out[55]:
<Axes: title={'center': 'Online Food Order Decisions Based on the Size of the Family'}, xlabel='Family size', ylabel='count'>
In [56]:
plt.figure(figsize=(10, 8))
plt.title(
"Online Food Order Decisions Based on the Educational Qualifications of the Customer"
)
sns.countplot(x="Educational Qualifications", data=data, hue="Feedback", palette="Set2")
Out[56]:
<Axes: title={'center': 'Online Food Order Decisions Based on the Educational Qualifications of the Customer'}, xlabel='Educational Qualifications', ylabel='count'>
In [57]:
plt.figure(figsize=(10, 8))
plt.title(
"Online Food Order Decisions Based on the Educational Qualifications by the Occupation of the Customer"
)
sns.countplot(
x="Educational Qualifications", data=data, hue="Occupation", palette="Set2_r"
)
Out[57]:
<Axes: title={'center': 'Online Food Order Decisions Based on the Educational Qualifications by the Occupation of the Customer'}, xlabel='Educational Qualifications', ylabel='count'>
In [59]:
female = len(data[data["Gender"] == "Female"])
male = len(data[data["Gender"] == "Male"])
print(male, female)
data_gender = [female, male]
labels = ["Female", "Male"]
colors = sns.color_palette("pastel")[0:5] # seaborn color palette to use
plt.figure(figsize=(10, 8))
plt.title("Distribution of Customer's Gender")
plt.pie(data_gender, labels=labels, colors=colors, autopct="%.0f%%")
plt.show()
222 166
In [60]:
plt.figure(figsize=(10, 8))
plt.title("Educational Qualifications Based on the Gender of the Customer")
sns.countplot(x="Occupation", data=data, hue="Gender", palette="Pastel2")
Out[60]:
<Axes: title={'center': 'Educational Qualifications Based on the Gender of the Customer'}, xlabel='Occupation', ylabel='count'>
In [65]:
import seaborn as sns
import plotly.graph_objects as go
reorder = data[data["Feedback"] == "Positive"]
gender_data = reorder["Gender"].value_counts()
label = gender_data.index
counts = gender_data.values
print(label, counts)
colors = sns.color_palette("pastel")[2:4]
fig = go.Figure(data=[go.Pie(labels=label, values=counts)])
fig.update_layout(title_text="Which Gender is More Likely to Order Online Again?")
fig.update_traces(
hoverinfo="label+percent", textinfo="value", marker=dict(colors=colors)
)
fig.show()
Index(['Male', 'Female'], dtype='object', name='Gender') [178 139]
In [67]:
plt.figure(figsize=(10, 8))
plt.title("Educational Qualifications Based on the Gender of the Customer")
sns.countplot(x="Monthly Income", data=data, hue="Feedback", palette="Pastel2_r")
Out[67]:
<Axes: title={'center': 'Educational Qualifications Based on the Gender of the Customer'}, xlabel='Monthly Income', ylabel='count'>
In [68]:
reorder = data[data["Feedback"] == "Positive"]
status_data = reorder["Marital Status"].value_counts()
label = status_data.index
counts = status_data.values
fig = go.Figure(data=[go.Pie(labels=label, values=counts)])
fig.update_layout(title_text="What is the Marital Status of Customers?")
fig.update_traces(
hoverinfo="label+percent", textinfo="value", marker=dict(colors=colors)
)
fig.show()
In [31]:
data["Monthly Income"].unique()
Out[31]:
array(['No Income', 'Below Rs.10000', 'More than 50000', '10001 to 25000', '25001 to 50000'], dtype=object)
In [69]:
data["Gender"] = data["Gender"].map({"Male": 0, "Female": 1}) # male or female
data["Marital Status"] = data["Marital Status"].map(
{"Married": 0, "Single": 1, "Prefer not to say": 2}
)
data["Occupation"] = data["Occupation"].replace(
to_replace=["Employee", "Self Employeed"], value=1
) # employed
data["Occupation"] = data["Occupation"].replace(
to_replace=["Student", "House wife"], value=0
) # unemployed
data["Educational Qualifications"] = data["Educational Qualifications"].map(
{"Graduate": 1, "Post Graduate": 2, "Ph.D": 3, "School": 4, "Uneducated": 5}
)
data["Monthly Income"] = data["Monthly Income"].replace(
to_replace=["No Income"], value=0
) # no income
data["Monthly Income"] = data["Monthly Income"].replace(
to_replace=[
"Below Rs.10000",
"More than 50000",
"25001 to 50000",
"10001 to 25000",
],
value=1,
) # has an income
data["Output"] = data["Output"].map({"No": 0, "Yes": 1}) # no or yes
In [70]:
data["Feedback"] = data["Feedback"].map(
{"Negative ": 0, "Positive": 1}
) # negative or positive
In [71]:
data.head()
Out[71]:
Age | Gender | Marital Status | Occupation | Monthly Income | Educational Qualifications | Family size | latitude | longitude | Pin code | Output | Feedback | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 20 | 1 | 1 | 0 | 0 | 2 | 4 | 12.9766 | 77.5993 | 560001 | 1 | 1 |
1 | 24 | 1 | 1 | 0 | 1 | 1 | 3 | 12.9770 | 77.5773 | 560009 | 1 | 1 |
2 | 22 | 0 | 1 | 0 | 1 | 2 | 3 | 12.9551 | 77.6593 | 560017 | 1 | 0 |
3 | 22 | 1 | 1 | 0 | 0 | 1 | 6 | 12.9473 | 77.5616 | 560019 | 1 | 1 |
4 | 22 | 0 | 1 | 0 | 1 | 2 | 4 | 12.9850 | 77.5533 | 560010 | 1 | 1 |
In [72]:
X = data.drop("Feedback", axis=1)
y = data["Feedback"]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=101
)
print("Shape of train dataset : ", X_train.shape)
print("Shape of test dataset : ", X_test.shape)
Shape of train dataset : (310, 11) Shape of test dataset : (78, 11)
In [73]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
print(rfc.score(X_test, y_test))
0.7948717948717948
In [74]:
pred = rfc.predict(X_test)
print(confusion_matrix(pred, y_test))
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_matrix(pred, y_test), annot=True)
[[ 5 5] [11 57]]
Out[74]:
<Axes: >
In [75]:
# Evaluating a score by cross-validation
# cv determines the cross-validation splitting strategy
scores = cross_val_score(rfc, X_train, y_train, cv=5)
# average score
print("Accuracy: ", scores.mean(), scores.std() * 2)
Accuracy: 0.867741935483871 0.06255070783762999
In [76]:
parameters = {"randomforestclassifier__n_estimators": (20, 50, 100)}
pipeline = make_pipeline(RandomForestClassifier())
pipeline
Out[76]:
Pipeline(steps=[('randomforestclassifier', RandomForestClassifier())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('randomforestclassifier', RandomForestClassifier())])
RandomForestClassifier()
In [77]:
gridsearch = GridSearchCV(pipeline, parameters, verbose=1, n_jobs=-1)
In [78]:
gridsearch.fit(X_train, y_train)
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Out[78]:
GridSearchCV(estimator=Pipeline(steps=[('randomforestclassifier', RandomForestClassifier())]), n_jobs=-1, param_grid={'randomforestclassifier__n_estimators': (20, 50, 100)}, verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=Pipeline(steps=[('randomforestclassifier', RandomForestClassifier())]), n_jobs=-1, param_grid={'randomforestclassifier__n_estimators': (20, 50, 100)}, verbose=1)
Pipeline(steps=[('randomforestclassifier', RandomForestClassifier())])
RandomForestClassifier()
In [79]:
print("Best score %0.3f" % gridsearch.best_score_)
print("Best parameters set: ")
best_parameters = gridsearch.best_estimator_.get_params()
for params in sorted(parameters.keys()):
print("\t%s: %r" % (params, best_parameters[params]))
Best score 0.877 Best parameters set: randomforestclassifier__n_estimators: 100
In [81]:
r = random.randint(0, len(data))
print(r)
new_customer = data.drop("Feedback", axis=1).iloc[r]
new_customer
4
Out[81]:
Age 22.0000 Gender 0.0000 Marital Status 1.0000 Occupation 0.0000 Monthly Income 1.0000 Educational Qualifications 2.0000 Family size 4.0000 latitude 12.9850 longitude 77.5533 Pin code 560010.0000 Output 1.0000 Name: 4, dtype: float64
In [82]:
rfc.predict(new_customer.values.reshape(1, -1)) # predicted output
Out[82]:
array([1], dtype=int64)
In [83]:
data.iloc[r]["Feedback"] # actual Output
Out[83]:
1.0
In [84]:
from sklearn.metrics import roc_curve, auc
# Compute predicted probabilities
y_pred_prob = rfc.predict_proba(X_test)[:, 1]
# Compute false positive rate, true positive rate, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Compute the area under the ROC curve
roc_auc = auc(fpr, tpr)
# Plot the ROC curve
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, label="ROC curve (area = %0.2f)" % roc_auc)
plt.plot([0, 1], [0, 1], "k--") # Random guess line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()
In [85]:
import joblib
# Save the model
joblib.dump(rfc, "model.pkl")
Out[85]:
['model.pkl']