Home
Model About Datasets
In [50]:
import numpy as np
import warnings
import random

warnings.filterwarnings("ignore")
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# model
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
In [51]:
data = pd.read_excel("onlinefoods.xlsx")
data.head(10)
Out[51]:
Age Gender Marital Status Occupation Monthly Income Educational Qualifications Family size latitude longitude Pin code Output Feedback
0 20 Female Single Student No Income Post Graduate 4 12.9766 77.5993 560001 Yes Positive
1 24 Female Single Student Below Rs.10000 Graduate 3 12.9770 77.5773 560009 Yes Positive
2 22 Male Single Student Below Rs.10000 Post Graduate 3 12.9551 77.6593 560017 Yes Negative
3 22 Female Single Student No Income Graduate 6 12.9473 77.5616 560019 Yes Positive
4 22 Male Single Student Below Rs.10000 Post Graduate 4 12.9850 77.5533 560010 Yes Positive
5 27 Female Married Employee More than 50000 Post Graduate 2 12.9299 77.6848 560103 Yes Positive
6 22 Male Single Student No Income Graduate 3 12.9770 77.5773 560009 Yes Positive
7 24 Female Single Student No Income Post Graduate 3 12.9828 77.6131 560042 Yes Positive
8 23 Female Single Student No Income Post Graduate 2 12.9766 77.5993 560001 Yes Positive
9 23 Female Single Student No Income Post Graduate 4 12.9854 77.7081 560048 Yes Positive
In [52]:
data.dtypes
Out[52]:
Age                             int64
Gender                         object
Marital Status                 object
Occupation                     object
Monthly Income                 object
Educational Qualifications     object
Family size                     int64
latitude                      float64
longitude                     float64
Pin code                        int64
Output                         object
Feedback                       object
dtype: object
In [53]:
data.isnull().sum()
Out[53]:
Age                           0
Gender                        0
Marital Status                0
Occupation                    0
Monthly Income                0
Educational Qualifications    0
Family size                   0
latitude                      0
longitude                     0
Pin code                      0
Output                        0
Feedback                      0
dtype: int64
In [54]:
plt.figure(figsize=(10, 8))
plt.title("Online Food Order Decisions Based on the Age of the Customer")
sns.countplot(x="Age", data=data, hue="Feedback", palette="Set3")
Out[54]:
<Axes: title={'center': 'Online Food Order Decisions Based on the Age of the Customer'}, xlabel='Age', ylabel='count'>
No description has been provided for this image
In [55]:
plt.figure(figsize=(10, 8))
plt.title("Online Food Order Decisions Based on the Size of the Family")
sns.countplot(x="Family size", data=data, hue="Feedback", palette="Set3_r")
Out[55]:
<Axes: title={'center': 'Online Food Order Decisions Based on the Size of the Family'}, xlabel='Family size', ylabel='count'>
No description has been provided for this image
In [56]:
plt.figure(figsize=(10, 8))
plt.title(
    "Online Food Order Decisions Based on the Educational Qualifications of the Customer"
)
sns.countplot(x="Educational Qualifications", data=data, hue="Feedback", palette="Set2")
Out[56]:
<Axes: title={'center': 'Online Food Order Decisions Based on the Educational Qualifications of the Customer'}, xlabel='Educational Qualifications', ylabel='count'>
No description has been provided for this image
In [57]:
plt.figure(figsize=(10, 8))
plt.title(
    "Online Food Order Decisions Based on the Educational Qualifications by the Occupation of the Customer"
)
sns.countplot(
    x="Educational Qualifications", data=data, hue="Occupation", palette="Set2_r"
)
Out[57]:
<Axes: title={'center': 'Online Food Order Decisions Based on the Educational Qualifications by the Occupation of the Customer'}, xlabel='Educational Qualifications', ylabel='count'>
No description has been provided for this image
In [59]:
female = len(data[data["Gender"] == "Female"])
male = len(data[data["Gender"] == "Male"])
print(male, female)
data_gender = [female, male]
labels = ["Female", "Male"]

colors = sns.color_palette("pastel")[0:5]  # seaborn color palette to use

plt.figure(figsize=(10, 8))
plt.title("Distribution of Customer's Gender")
plt.pie(data_gender, labels=labels, colors=colors, autopct="%.0f%%")
plt.show()
222 166
No description has been provided for this image
In [60]:
plt.figure(figsize=(10, 8))
plt.title("Educational Qualifications Based on the Gender of the Customer")
sns.countplot(x="Occupation", data=data, hue="Gender", palette="Pastel2")
Out[60]:
<Axes: title={'center': 'Educational Qualifications Based on the Gender of the Customer'}, xlabel='Occupation', ylabel='count'>
No description has been provided for this image
In [65]:
import seaborn as sns
import plotly.graph_objects as go

reorder = data[data["Feedback"] == "Positive"]

gender_data = reorder["Gender"].value_counts()

label = gender_data.index

counts = gender_data.values

print(label, counts)

colors = sns.color_palette("pastel")[2:4]


fig = go.Figure(data=[go.Pie(labels=label, values=counts)])

fig.update_layout(title_text="Which Gender is More Likely to Order Online Again?")

fig.update_traces(

    hoverinfo="label+percent", textinfo="value", marker=dict(colors=colors)
)


fig.show()
Index(['Male', 'Female'], dtype='object', name='Gender') [178 139]
In [67]:
plt.figure(figsize=(10, 8))
plt.title("Educational Qualifications Based on the Gender of the Customer")
sns.countplot(x="Monthly Income", data=data, hue="Feedback", palette="Pastel2_r")
Out[67]:
<Axes: title={'center': 'Educational Qualifications Based on the Gender of the Customer'}, xlabel='Monthly Income', ylabel='count'>
No description has been provided for this image
In [68]:
reorder = data[data["Feedback"] == "Positive"]
status_data = reorder["Marital Status"].value_counts()
label = status_data.index
counts = status_data.values

fig = go.Figure(data=[go.Pie(labels=label, values=counts)])
fig.update_layout(title_text="What is the Marital Status of Customers?")
fig.update_traces(
    hoverinfo="label+percent", textinfo="value", marker=dict(colors=colors)
)

fig.show()
In [31]:
data["Monthly Income"].unique()
Out[31]:
array(['No Income', 'Below Rs.10000', 'More than 50000', '10001 to 25000',
       '25001 to 50000'], dtype=object)
In [69]:
data["Gender"] = data["Gender"].map({"Male": 0, "Female": 1})  # male or female

data["Marital Status"] = data["Marital Status"].map(
    {"Married": 0, "Single": 1, "Prefer not to say": 2}
)


data["Occupation"] = data["Occupation"].replace(
    to_replace=["Employee", "Self Employeed"], value=1
)  # employed
data["Occupation"] = data["Occupation"].replace(
    to_replace=["Student", "House wife"], value=0
)  # unemployed


data["Educational Qualifications"] = data["Educational Qualifications"].map(
    {"Graduate": 1, "Post Graduate": 2, "Ph.D": 3, "School": 4, "Uneducated": 5}
)


data["Monthly Income"] = data["Monthly Income"].replace(
    to_replace=["No Income"], value=0
)  # no income
data["Monthly Income"] = data["Monthly Income"].replace(
    to_replace=[
        "Below Rs.10000",
        "More than 50000",
        "25001 to 50000",
        "10001 to 25000",
    ],
    value=1,
)  # has an income


data["Output"] = data["Output"].map({"No": 0, "Yes": 1})  # no or yes
In [70]:
data["Feedback"] = data["Feedback"].map(
    {"Negative ": 0, "Positive": 1}
)  # negative or positive
In [71]:
data.head()
Out[71]:
Age Gender Marital Status Occupation Monthly Income Educational Qualifications Family size latitude longitude Pin code Output Feedback
0 20 1 1 0 0 2 4 12.9766 77.5993 560001 1 1
1 24 1 1 0 1 1 3 12.9770 77.5773 560009 1 1
2 22 0 1 0 1 2 3 12.9551 77.6593 560017 1 0
3 22 1 1 0 0 1 6 12.9473 77.5616 560019 1 1
4 22 0 1 0 1 2 4 12.9850 77.5533 560010 1 1
In [72]:
X = data.drop("Feedback", axis=1)
y = data["Feedback"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101
)

print("Shape of train dataset : ", X_train.shape)
print("Shape of test dataset : ", X_test.shape)
Shape of train dataset :  (310, 11)
Shape of test dataset :  (78, 11)
In [73]:
rfc = RandomForestClassifier(n_estimators=100)

rfc.fit(X_train, y_train)

print(rfc.score(X_test, y_test))
0.7948717948717948
In [74]:
pred = rfc.predict(X_test)

print(confusion_matrix(pred, y_test))

plt.figure(figsize=(10, 8))
sns.heatmap(confusion_matrix(pred, y_test), annot=True)
[[ 5  5]
 [11 57]]
Out[74]:
<Axes: >
No description has been provided for this image
In [75]:
# Evaluating a score by cross-validation
# cv determines the cross-validation splitting strategy
scores = cross_val_score(rfc, X_train, y_train, cv=5)

# average score
print("Accuracy: ", scores.mean(), scores.std() * 2)
Accuracy:  0.867741935483871 0.06255070783762999
In [76]:
parameters = {"randomforestclassifier__n_estimators": (20, 50, 100)}

pipeline = make_pipeline(RandomForestClassifier())

pipeline
Out[76]:
Pipeline(steps=[('randomforestclassifier', RandomForestClassifier())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('randomforestclassifier', RandomForestClassifier())])
RandomForestClassifier()
In [77]:
gridsearch = GridSearchCV(pipeline, parameters, verbose=1, n_jobs=-1)
In [78]:
gridsearch.fit(X_train, y_train)
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Out[78]:
GridSearchCV(estimator=Pipeline(steps=[('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__n_estimators': (20, 50, 100)},
             verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=Pipeline(steps=[('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__n_estimators': (20, 50, 100)},
             verbose=1)
Pipeline(steps=[('randomforestclassifier', RandomForestClassifier())])
RandomForestClassifier()
In [79]:
print("Best score %0.3f" % gridsearch.best_score_)
print("Best parameters set: ")
best_parameters = gridsearch.best_estimator_.get_params()

for params in sorted(parameters.keys()):
    print("\t%s: %r" % (params, best_parameters[params]))
Best score 0.877
Best parameters set: 
	randomforestclassifier__n_estimators: 100
In [81]:
r = random.randint(0, len(data))
print(r)
new_customer = data.drop("Feedback", axis=1).iloc[r]
new_customer
4
Out[81]:
Age                               22.0000
Gender                             0.0000
Marital Status                     1.0000
Occupation                         0.0000
Monthly Income                     1.0000
Educational Qualifications         2.0000
Family size                        4.0000
latitude                          12.9850
longitude                         77.5533
Pin code                      560010.0000
Output                             1.0000
Name: 4, dtype: float64
In [82]:
rfc.predict(new_customer.values.reshape(1, -1))  # predicted output
Out[82]:
array([1], dtype=int64)
In [83]:
data.iloc[r]["Feedback"]  # actual Output
Out[83]:
1.0
In [84]:
from sklearn.metrics import roc_curve, auc

# Compute predicted probabilities
y_pred_prob = rfc.predict_proba(X_test)[:, 1]

# Compute false positive rate, true positive rate, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Compute the area under the ROC curve
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, label="ROC curve (area = %0.2f)" % roc_auc)
plt.plot([0, 1], [0, 1], "k--")  # Random guess line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image
In [85]:
import joblib

# Save the model
joblib.dump(rfc, "model.pkl")
Out[85]:
['model.pkl']