Importing Libraries

import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# prefer model_selection and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ketan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


C:\Users\ketan\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
C:\Users\ketan\Anaconda3\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d

Reading Data

reviews = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
reviews.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	Review	Liked
0	Wow... Loved this place.	1
1	Crust is not good.	0
2	Not tasty and the texture was just nasty.	0
3	Stopped by during the late May bank holiday of...	1
4	The selection on the menu was great and so wer...	1

Preprocessing of data

# Cleaning the texts
# Build the stemmer, the stop-word set and the regex once, outside the loop:
# none of them depends on the review being processed.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column itself rather than a hard-coded range(0, 1000), so
# the corpus always has one entry per row of `reviews` (and therefore stays
# aligned with the labels taken from the same frame).
for text in reviews['Review']:
    # Replace every non-letter with a space, lowercase, then split on whitespace.
    words = non_letters.sub(' ', text).lower().split()
    # Drop English stop words and stem the remaining tokens.
    review = ' '.join(ps.stem(word) for word in words if word not in stop_words)
    corpus.append(review)

Creating the bag-of-words model

# Labels: the second column of the reviews frame, as a NumPy array.
y = reviews.iloc[:, 1].values

# Features: token counts over the cleaned corpus, vocabulary capped at 1500
# entries, converted from the vectorizer's output to a dense array.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

Splitting the dataset into the Training set and Test set

# Hold out 20% of the samples as the test set; the fixed seed makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

def resultPrintHelper(classifier, X_test, y_test):
    """Print evaluation metrics for a fitted classifier on a test set.

    Predicts labels for ``X_test`` with ``classifier.predict`` and prints the
    accuracy, precision, recall and F1 scores followed by the confusion
    matrix, each computed against ``y_test``.

    Parameters
    ----------
    classifier : object exposing ``predict(X)``; must already be fitted.
    X_test : feature matrix passed to ``classifier.predict``.
    y_test : true labels matching the rows of ``X_test``.

    Returns
    -------
    None. The results are written to standard output only.
    """
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    print("Accuracy score is: {}".format(accuracy_score(y_test, y_pred)))
    print("Precision score is: {}".format(precision_score(y_test, y_pred)))
    print("Recall score is: {}".format(recall_score(y_test, y_pred)))
    print("F1 score is: {}".format(f1_score(y_test, y_pred)))
    # Header previously misspelled as "Matirx".
    print("------Confusion Matrix------")
    print(confusion_matrix(y_test, y_pred))

Classifiers

1. Naive Bayes

# Train a Gaussian Naive Bayes model on the training split (fit returns the
# estimator itself), then report its metrics on the held-out test split.
bayesClassifier = GaussianNB().fit(X_train, y_train)
resultPrintHelper(bayesClassifier, X_test, y_test)

Accuracy score is: 0.73
Precision score is: 0.6842105263157895
Recall score is: 0.883495145631068
F1 score is: 0.7711864406779663
------Confusion Matirx------
[[55 42]
 [12 91]]

2. Decision Tree

# Fit a decision tree on the training split and report test-set metrics.
# A fixed random_state makes the fitted tree (and therefore the printed
# scores) repeatable between runs, matching the seeded split and the seeded
# random forest used elsewhere in this notebook.
dstClassifier = DecisionTreeClassifier(random_state=0)
dstClassifier.fit(X_train, y_train)
resultPrintHelper(dstClassifier, X_test, y_test)

Accuracy score is: 0.67
Precision score is: 0.7176470588235294
Recall score is: 0.5922330097087378
F1 score is: 0.648936170212766
------Confusion Matirx------
[[73 24]
 [42 61]]

3. Random Forest

# Random forest of 1000 trees, using all available cores (n_jobs=-1) and a
# fixed seed; fit on the training split, then report test-set metrics.
rftClassifier = RandomForestClassifier(
    n_estimators=1000,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)
resultPrintHelper(rftClassifier, X_test, y_test)

Accuracy score is: 0.705
Precision score is: 0.8333333333333334
Recall score is: 0.5339805825242718
F1 score is: 0.650887573964497
------Confusion Matirx------
[[86 11]
 [48 55]]

Name		Name	Last commit message	Last commit date
Latest commit History 5 Commits
.ipynb_checkpoints		.ipynb_checkpoints
README.md		README.md
Restaurant_Reviews.tsv		Restaurant_Reviews.tsv
ReviewClassification.ipynb		ReviewClassification.ipynb

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

Importing Libraries

Reading Data

Preprocessing of data

Creating bag of words model

Splitting the dataset into the Training set and Test set

Classifiers

1. Naive Bayes

2. Decision Tree

3. Random Forest tree

About

Uh oh!

Releases

Packages

Languages

Thek10patil/Review-Classification

Folders and files

Latest commit

History

Repository files navigation

Importing Libraries

Reading Data

Preprocessing of data

Creating bag of words model

Splitting the dataset into the Training set and Test set

Classifiers

1. Naive Bayes

2. Decision Tree

3. Random Forest tree

About

Resources

Uh oh!

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages