import pandas as pd
import matplotlib.pyplot as plt

# Load the pet-adoption records from the CSV file into a DataFrame.
df = pd.read_csv("pet_adoption_data.csv")

# Preview the first rows.
df.head()

# Count how many rows fall into each class of the target column.
df["AdoptionLikelihood"].value_counts()

AdoptionLikelihood
0    1348
1     659
Name: count, dtype: int64

# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2007 entries, 0 to 2006
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   PetID               2007 non-null   int64  
 1   PetType             2007 non-null   object 
 2   Breed               2007 non-null   object 
 3   AgeMonths           2007 non-null   int64  
 4   Color               2007 non-null   object 
 5   Size                2007 non-null   object 
 6   WeightKg            2007 non-null   float64
 7   Vaccinated          2007 non-null   int64  
 8   HealthCondition     2007 non-null   int64  
 9   TimeInShelterDays   2007 non-null   int64  
 10  AdoptionFee         2007 non-null   int64  
 11  PreviousOwner       2007 non-null   int64  
 12  AdoptionLikelihood  2007 non-null   int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 204.0+ KB

# Remove PetID in place so it never reaches the feature matrix built from df
# (presumably it is only a row identifier -- confirm against the data source).
df.drop(columns="PetID", inplace=True)

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV

# Candidate classifiers compared below. Each is constructed with its default
# arguments, except LogisticRegression, whose iteration limit is set to 1000.
models = [
    LogisticRegression(max_iter=1000),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(),
    KNeighborsClassifier(),
    BernoulliNB()
]

# Separate the target column from the feature columns.
target_var = 'AdoptionLikelihood'
X = df.drop(columns=target_var)
y = df[target_var]

# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Group the feature columns by dtype so each group gets its own preprocessing.
num_features = X.select_dtypes(include='number').columns
cat_features = X.select_dtypes(include='object').columns

# Numeric columns: fill gaps with the column mean, then standardize.
num_proc = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
# Categorical columns: one-hot encode, dropping the first level of every feature.
# No imputer is applied to the categorical columns.
cat_proc = make_pipeline(OneHotEncoder(drop='first'))

preprocess = make_column_transformer(
    (num_proc, num_features),
    (cat_proc, cat_features),
)

# Local import: cross_val_score is not among the names imported earlier in this file.
from sklearn.model_selection import cross_val_score

# Mean 5-fold cross-validated accuracy of every candidate model on the
# training split. cross_val_score replaces the previous GridSearchCV call with
# an empty parameter grid: the fold scores are the same, but no pipeline is
# refitted on the full training set just to be thrown away.
# The rows are built as (model, score) tuples so the estimator objects land in
# the 'Model' column unchanged.
score = pd.DataFrame(
    [
        (
            model,
            cross_val_score(
                make_pipeline(preprocess, model),
                X_train,
                y_train,
                cv=5,
                scoring='accuracy',
                n_jobs=-1,
            ).mean(),
        )
        for model in models
    ],
    columns=['Model', 'Score'],
)

# Rank the models from best to worst mean accuracy.
score.sort_values(by="Score", ascending=False).round(2)

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Hyperparameter candidates for the decision tree. Each key is prefixed with
# the step name that make_pipeline derives from the estimator's class name.
param_grid_tree = {
    'decisiontreeclassifier__min_samples_leaf': [1, 2, 4],
    'decisiontreeclassifier__max_depth': [10, 50, None],
}

pipe = make_pipeline(preprocess, DecisionTreeClassifier())

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy.
grid_tree = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid_tree,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_tree.fit(X_train, y_train)

print("Best score: ", grid_tree.best_score_)
print("Best params: ", grid_tree.best_params_)

Best score:  0.9266573462125063
Best params:  {'decisiontreeclassifier__max_depth': 10, 'decisiontreeclassifier__min_samples_leaf': 4}

# Score the tuned decision tree on the training split it was fitted on.
yy = y_train
prediction = grid_tree.best_estimator_.predict(X_train)

# Baseline = accuracy of always predicting the most frequent class. It is
# computed from the same labels the metrics below are computed on (yy), so the
# comparison stays valid whichever split is being scored.
baseline = round(yy.value_counts(normalize=True).max(), 2)

print('Baseline (Accuracy):', baseline)
print('Accuracy:           ', round(accuracy_score(yy, prediction), 4))
print('Precision:          ', round(precision_score(yy, prediction), 4))
print('Recall:             ', round(recall_score(yy, prediction), 4))
print('F1                  ', round(f1_score(yy, prediction), 4))

# Confusion matrix with rows/columns ordered by the fitted model's class labels.
cm = confusion_matrix(yy, prediction, labels=grid_tree.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=grid_tree.classes_)
disp.plot()
plt.title('\nConfusion Matrix\n', fontsize=14, weight='bold')
plt.show()

Baseline (Accuracy): 0.67
Accuracy:            0.9509
Precision:           0.9514
Recall:              0.8954
F1                   0.9226

# Save results in a dataframe for later comparison
df_results = pd.DataFrame({"Measure":["Accuracy", "Precision", "Recall", "F1"],
                           "Decision Tree": [accuracy_score(yy, prediction),
                                             precision_score(yy, prediction),
                                             recall_score(yy, prediction),
                                             f1_score(yy, prediction)]
                                             })
df_results.round(4)

# Hyperparameter candidates for the random forest. Each key is prefixed with
# the step name that make_pipeline derives from the estimator's class name.
param_grid_rf = {
    'randomforestclassifier__min_samples_leaf': [10, 20],
    'randomforestclassifier__max_depth': [10, 50, None],
    'randomforestclassifier__n_estimators': [10, 100, 200],
}

pipe = make_pipeline(preprocess, RandomForestClassifier())

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy.
grid_rf = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

print("Best score: ", grid_rf.best_score_)
print("Best params: ", grid_rf.best_params_)

Best score:  0.9280782918149466
Best params:  {'randomforestclassifier__max_depth': 50, 'randomforestclassifier__min_samples_leaf': 10, 'randomforestclassifier__n_estimators': 200}

# Score the tuned random forest on the training split it was fitted on.
yy = y_train
prediction = grid_rf.best_estimator_.predict(X_train)

# Baseline = accuracy of always predicting the most frequent class. It is
# computed from the same labels the metrics below are computed on (yy), so the
# comparison stays valid whichever split is being scored.
baseline = round(yy.value_counts(normalize=True).max(), 2)

print('Baseline (Accuracy):', baseline)
print('Accuracy:           ', round(accuracy_score(yy, prediction), 4))
print('Precision:          ', round(precision_score(yy, prediction), 4))
print('Recall:             ', round(recall_score(yy, prediction), 4))
print('F1                  ', round(f1_score(yy, prediction), 4))

# Confusion matrix with rows/columns ordered by the fitted model's class labels.
cm = confusion_matrix(yy, prediction, labels=grid_rf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=grid_rf.classes_)
disp.plot()
plt.title('\nConfusion Matrix\n', fontsize=14, weight='bold')
plt.show()

Baseline (Accuracy): 0.67
Accuracy:            0.9452
Precision:           0.9614
Recall:              0.8671
F1                   0.9118

# Add the random-forest metrics as a new column, in the same order as the
# "Measure" column: accuracy, precision, recall, F1.
df_results["Random Forest"] = [
    metric(yy, prediction)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
]
df_results.round(4)

# Score the tuned decision tree on the held-out test split.
yy = y_test
prediction = grid_tree.best_estimator_.predict(X_test)

# Baseline = accuracy of always predicting the most frequent class among the
# labels being scored.
baseline = round(yy.value_counts(normalize=True).max(), 2)

print('Baseline (Accuracy):', baseline)
print('Accuracy:           ', round(accuracy_score(yy, prediction), 4))
print('Precision:          ', round(precision_score(yy, prediction), 4))
print('Recall:             ', round(recall_score(yy, prediction), 4))
print('F1                  ', round(f1_score(yy, prediction), 4))

# The class labels come from the model that produced the predictions
# (grid_tree), not from the random-forest search, so the matrix axes always
# match the estimator being evaluated.
cm = confusion_matrix(yy, prediction, labels=grid_tree.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=grid_tree.classes_)
disp.plot()
plt.title('\nConfusion Matrix\n', fontsize=14, weight='bold')
plt.show()

Baseline (Accuracy): 0.67
Accuracy:            0.9237
Precision:           0.8812
Recall:              0.89
F1                   0.8856

	PetID	PetType	Breed	AgeMonths	Color	Size	WeightKg	Vaccinated	TimeInShelterDays	AdoptionFee	PreviousOwner
0	500	Bird	Parakeet	131	Orange	Large	5.039768	1	27	140	0
1	501	Rabbit	Rabbit	73	White	Large	16.086727	0	8	235	0
2	502	Dog	Golden Retriever	136	Orange	Medium	2.076286	0	85	385	0
3	503	Bird	Parakeet	97	White	Small	3.339423	0	61	217	1
4	504	Rabbit	Rabbit	123	Gray	Large	20.498100	0	28	14	1

Machine Learning - Classification

Predicting Pet Adoption Likelihood

Initial data exploration & cleaning

Import necessary modules from Scikit Learn

Define X and y

Train/Test Split and Preprocessing

Train models

Hyperparameter Tuning

Hyperparameter Tuning - Decision Tree

Hyperparameter Tuning - Random Forest Classifier

Run model on Test Data

Conclusion:

	Model	Score
2	RandomForestClassifier()	0.93
0	LogisticRegression(max_iter=1000)	0.90
3	SVC()	0.90
1	DecisionTreeClassifier()	0.88
5	BernoulliNB()	0.86
4	KNeighborsClassifier()	0.82

	Measure	Decision Tree	Random Forest
0	Accuracy	0.9509	0.9452
1	Precision	0.9514	0.9614
2	Recall	0.8954	0.8671
3	F1	0.9226	0.9118