Scatter Plot (1-10-13-22)
import pandas as pd
import matplotlib.pyplot as plt
iris = pd.read_csv("Iris.csv")
plt.plot(iris.Id, iris["SepalLengthCm"], "r--")
plt.show()
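# The plot above draws a dashed red line rather than individual points; a minimal
# sketch of an actual scatter plot of the same two columns (same Iris.csv assumed):
plt.scatter(iris["Id"], iris["SepalLengthCm"], color="red", marker="o")
plt.xlabel("Id")
plt.ylabel("SepalLengthCm")
plt.show()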
Scatter Plot 2 (1-10-13-22)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Loading the Data
iris = pd.read_csv('Iris.csv')
#Checking the head of Dataset
iris.head(5)
#Checking the column info
iris.info()
#to check the statistics for the iris dataset
iris.describe()
iris.groupby('Species').min()
iris.groupby('Species').max()
#Checking if there is any null data in the iris dataset
iris.isnull().any()
# There is no missing Data.
iris.groupby('Species').count()
#using countplot to see the number of flowers of each iris species
sns.countplot(x=iris['Species'])
iris['PetalLengthCm'].hist(bins=20)
iris.drop('Id',axis=1).hist(bins=20,figsize=(10,10))
iris.groupby('Species').PetalLengthCm.plot.hist(alpha=0.4)
plt.xlabel('PetalLengthCm')
plt.suptitle('Histogram of PetalLengthCm for different Species')
plt.legend(loc=(0.7,0.79)) # this is the key (legend) of the graph; loc sets its position
plt.grid()
iris.groupby('Species').PetalWidthCm.plot.hist(alpha=0.4)
plt.xlabel('PetalWidthCm')
plt.suptitle('Histogram of PetalWidthCm for different Species')
plt.legend(loc=(0.69,0.75))
plt.grid()
#Histogram using Matplotlib, Pandas and Seaborn.
plt.hist(data=iris,x='PetalLengthCm',bins=20)#Matplotlib
iris['PetalLengthCm'].hist(bins=20)# Pandas
sns.histplot(iris['PetalLengthCm'],bins=20,kde=True) #Seaborn (histplot; distplot is deprecated in newer seaborn)
#to see the relationship between 2 variables.
sns.set_style('darkgrid')
sns.scatterplot(data=iris,x='PetalLengthCm',y='PetalWidthCm',hue='Species')
sns.scatterplot(data=iris,x='PetalLengthCm',y='SepalLengthCm',hue='Species')
#Since this data is not that big and has only 4 features, we can use pairplot
#and check the relationship between 2 variables grouped by the Species column.
sns.pairplot(data=iris.drop('Id',axis=1),hue='Species')
# Further to check the Correlation among these variables, we can use corr() function and use
# Heatmap to visualize the correlation.
iris.drop('Id',axis=1).corr()
# We have noticed from the above correlation that Petal Length and Petal Width are most correlated features.
# Also, there is a good correlation between Petal Length and Sepal Length.
#Plotting the correlation using HeatMap
sns.heatmap(iris.drop('Id',axis=1).corr(),cmap='viridis',annot=True,)
plt.suptitle('Heatmap')
sns.boxplot(data=iris,y='SepalLengthCm')
sns.boxplot(data=iris,x='Species',y='SepalLengthCm')
sns.violinplot(data=iris,y='SepalWidthCm')
sns.violinplot(data=iris,x='Species',y='SepalLengthCm')
#Subplot
fig,axes = plt.subplots(ncols=2,nrows=1)# Creating the grid
axes[0].hist(iris['PetalLengthCm'])# Plotting on each axis
axes[0].set_title('PetalLengthCm')# Setting the Title
axes[1].scatter(iris['PetalLengthCm'],iris['PetalWidthCm'])# Plotting on each axis
axes[1].set_title('ScatterPlot')# Setting the Title
#Facet
g=sns.FacetGrid(data=iris,col='Species')# this creates the blank grid based on level of categorical variable.
g.map(plt.hist,'PetalLengthCm')# plotting using the grid created.
# grid can also be created using 2 Categorical Variables and span over rows.
h=sns.FacetGrid(data=iris,col='Species')
h.map(plt.scatter,'PetalLengthCm','PetalWidthCm',color='r')# for bivariate (two-variable) plotting
Program Null Remove (2-11-14-23)
import pandas as pd
"""
What is a DataFrame?
A Pandas DataFrame is a 2-dimensional data structure, like a 2-dimensional array, or a table with rows and columns.
"""
emp = pd.read_csv("employees.csv")
emp.isnull().sum()
emp[emp["MANAGER_ID"].isnull()]
emp.dropna() # returns a new DataFrame with the null-containing rows removed; emp itself is unchanged
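# dropna() above discards the incomplete rows; if they should be kept instead, a sketch of
# filling the missing MANAGER_ID values (the placeholder value 0 is an assumption, not part
# of the original program) could look like this:
emp_filled = emp.copy()
emp_filled["MANAGER_ID"] = emp_filled["MANAGER_ID"].fillna(0)
emp_filled.isnull().sum()  # MANAGER_ID should now report zero missing values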
Categorical To Numerical (3-12-15-24)
import pandas as pd
iris = pd.read_csv("iris.csv")
"""
Need to convert categorical data into numerical data
if we need to develop any algorithm or do any analysis from the perspective of data modeling
as we save space and processing time
"""
iris.head()
iris.tail()
#create a new variable varietyCode
iris["varietyCode"] = pd.factorize(iris.variety)[0]
iris.head()
iris.tail()
#to show the varieties and count of varieties
iris.variety.value_counts()
#to show the value of the code assigned to each variety
iris.varietyCode.value_counts()
# method II
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
iris["Code_le"] = le.fit_transform(iris.variety)
iris.head()
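# Method III (sketch): factorize and LabelEncoder both produce ordinal integer codes; when the
# categories have no natural order, one-hot encoding is a common alternative. This sketch assumes
# the same iris DataFrame with its 'variety' column.
iris_onehot = pd.get_dummies(iris, columns=["variety"], prefix="variety")
iris_onehot.head()  # one 0/1 indicator column per variety instead of a single integer code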
Simple Linear Regression (4-16-25)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
HouseDF = pd.read_csv('USA_Housing.csv')
HouseDF.head()
HouseDF.info()
HouseDF.describe()
HouseDF.columns
sns.pairplot(HouseDF)
sns.histplot(HouseDF['Price'])
sns.heatmap(HouseDF.corr(numeric_only=True), annot=True) # numeric_only restricts the correlation to numeric columns (needed on newer pandas when non-numeric columns such as Address are present)
plt.show()
X = HouseDF[['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
'Avg. Area Number of Bedrooms', 'Area Population']]
y= HouseDF['Price']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=101)
X_train
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train,y_train)
coeff_df = pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])
coeff_df
predictions = lm.predict(X_test)
plt.scatter(y_test, predictions)
plt.show()
sns.histplot((y_test-predictions),bins=50)
plt.show()
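# Beyond the scatter of predictions against y_test, the fit is usually summarised with error
# metrics; a short sketch using sklearn.metrics on the predictions computed above:
from sklearn import metrics
print('MAE :', metrics.mean_absolute_error(y_test, predictions))
print('MSE :', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R^2 :', metrics.r2_score(y_test, predictions))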
Multiple Regression (5-17-26)
#Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#Reading the dataset
dataset = pd.read_csv("advertising.csv")
dataset.head()
dataset.shape
#Splitting the dataset
#from sklearn.model_selection import train_test_split
#x_train, x_test, y_train,y_test = train_test_split(x, y, test_size = 0.3, random_state = 100)
dataset.isna().sum()
dataset.duplicated().any()
fig, axs = plt.subplots(3, figsize = (5,5))
plt1 = sns.boxplot(dataset['TV'], ax = axs[0])
plt2 = sns.boxplot(dataset['Newspaper'], ax = axs[1])
plt3 = sns.boxplot(dataset['Radio'], ax = axs[2])
plt.tight_layout()
sns.histplot(dataset['Sales']);
sns.pairplot(dataset, x_vars=['TV', 'Radio', 'Newspaper'], y_vars='Sales', height=4, aspect=1, kind='scatter')
plt.show()
sns.heatmap(dataset.corr(), annot = True)
plt.show()
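# The listing above stops at exploration; a minimal sketch of the multiple regression fit itself
# on TV, Radio and Newspaper against Sales (column names assumed from the advertising.csv used above):
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
x = dataset[['TV', 'Radio', 'Newspaper']]
y = dataset['Sales']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=100)
mlr = LinearRegression()
mlr.fit(x_train, y_train)
print('Intercept   :', mlr.intercept_)
print('Coefficients:', list(zip(x.columns, mlr.coef_)))
print('Test R^2    :', r2_score(y_test, mlr.predict(x_test)))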
Polynomial Regression Demo (6-18-27)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore') #this will ignore the warnings. It won't display warnings in notebook
# We can see that the dataset has 10 levels and the corresponding salary paid to the employee
dataset = pd.read_csv("Position_Salaries.csv")
dataset
# For the features we are selecting all the rows of column Level
# represented by column position 1 (or -2) in the dataset.
X=dataset.iloc[:,1:2].values
# for the target we are selecting only the salary column which
#can be selected using -1 or 2 as the column location in the dataset
y=dataset.iloc[:,2].values
X
#Linear Regression
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()
lin_reg.fit(X,y)
#Polynomial Regression
#with degree=2
from sklearn.preprocessing import PolynomialFeatures
poly_reg2=PolynomialFeatures(degree=2)
X_poly=poly_reg2.fit_transform(X)
lin_reg_2=LinearRegression()
lin_reg_2.fit(X_poly,y)
#with degree=3
poly_reg3=PolynomialFeatures(degree=3)
X_poly3=poly_reg3.fit_transform(X)
lin_reg_3=LinearRegression()
lin_reg_3.fit(X_poly3,y)
#Visualizing Linear Regression result
plt.scatter(X,y,color='red')
plt.plot(X,lin_reg.predict(X),color='blue')
plt.title('Truth Or Bluff (Linear Regression)')
plt.xlabel('Position Level')
plt.ylabel('Salary')
#to avoid the y-axis being displayed in offset/scientific (1e6) form
#plt.ticklabel_format(useOffset=False, style='plain')
plt.show()
#Visualizing Polynomial Linear Regression result
plt.scatter(X,y,color='red')
plt.plot(X,lin_reg_2.predict(poly_reg2.fit_transform(X)),color='blue')
plt.plot(X,lin_reg_3.predict(poly_reg3.fit_transform(X)),color='green')
plt.title('Truth Or Bluff (Polynomial Linear Regression)')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()
#Smoothing out the curve using more points on X axis
X_grid=np.arange(min(X),max(X),0.1) # This gives us a vector; we have to reshape it into a column matrix
X_grid=X_grid.reshape((len(X_grid),1))
plt.scatter(X,y,color='red')
plt.plot(X_grid,lin_reg_3.predict(poly_reg3.fit_transform(X_grid)),color='blue')
#plt.plot(X,lin_reg_3.predict(poly_reg3.fit_transform(X)),color='green')
plt.title('Truth Or Bluff (Polynomial Linear Regression)')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()
#Predicting the salary of the employee
lin_reg.predict([[6.5]]) # We are assuming the level of the employee is 6.5
lin_reg_2.predict(poly_reg2.fit_transform([[6.5]]))
lin_reg_3.predict(poly_reg3.fit_transform([[6.5]]))
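# To quantify how much better the polynomial fits are, R^2 on the full dataset can be compared
# across the three models trained above (a quick sketch, not a proper validation):
from sklearn.metrics import r2_score
print('Linear R^2           :', r2_score(y, lin_reg.predict(X)))
print('Polynomial deg=2 R^2 :', r2_score(y, lin_reg_2.predict(poly_reg2.fit_transform(X))))
print('Polynomial deg=3 R^2 :', r2_score(y, lin_reg_3.predict(poly_reg3.fit_transform(X))))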
Naive Bayes (7-19-28)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization purposes
import seaborn as sns # for statistical data visualization
import warnings
warnings.filterwarnings('ignore')
data = 'adult.csv'
df = pd.read_csv(data, header=None, sep=r',\s*', engine='python') # regex separator strips the space after each comma
df.shape
df.head(5)
col_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship',
'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']
df.columns = col_names
df.columns
df.head()
df.info()
# find categorical variables
categorical = [var for var in df.columns if df[var].dtype=='O']
print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are :\n\n', categorical)
# view the categorical variables
df[categorical].head()
# check missing values in categorical variables
df[categorical].isnull().sum()
# view frequency counts of values in categorical variables
for var in categorical:
    print(df[var].value_counts())
# check labels in workclass variable
df.workclass.unique()
# check frequency distribution of values in workclass variable
df.workclass.value_counts()
# view frequency distribution of categorical variables
for var in categorical:
    print(df[var].value_counts()/len(df))  # np.float was removed in newer NumPy; len(df) works directly
# check labels in workclass variable
df.workclass.unique()
# check frequency distribution of values in workclass variable
df.workclass.value_counts()
# replace '?' values in workclass variable with `NaN`
df['workclass'].replace('?', np.nan, inplace=True)
# again check the frequency distribution of values in workclass variable
df.workclass.value_counts()
# check labels in occupation variable
df.occupation.unique()
# check frequency distribution of values in occupation variable
df.occupation.value_counts()
# replace '?' values in occupation variable with `NaN`
df['occupation'].replace('?', np.nan, inplace=True)
# again check the frequency distribution of values in occupation variable
df.occupation.value_counts()
# check labels in native_country variable
df.native_country.unique()
# check frequency distribution of values in native_country variable
df.native_country.value_counts()
# replace '?' values in native_country variable with `NaN`
df['native_country'].replace('?', np.nan, inplace=True)
# again check the frequency distribution of values in native_country variable
df.native_country.value_counts()
df[categorical].isnull().sum()
# check for cardinality in categorical variables
for var in categorical:
    print(var, ' contains ', len(df[var].unique()), ' labels')
# find numerical variables
numerical = [var for var in df.columns if df[var].dtype!='O']
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :', numerical)
# view the numerical variables
df[numerical].head()
# check missing values in numerical variables
df[numerical].isnull().sum()
# 8. Declare feature vector and target variable
X = df.drop(['income'], axis=1)
y = df['income']
# 9. Split data into separate training and test set
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
# check the shape of X_train and X_test
X_train.shape, X_test.shape
# check data types in X_train
X_train.dtypes
# display categorical variables
categorical = [col for col in X_train.columns if X_train[col].dtypes == 'O']
categorical
# display numerical variables
numerical = [col for col in X_train.columns if X_train[col].dtypes != 'O']
numerical
# print percentage of missing values in the categorical variables in training set
X_train[categorical].isnull().mean()
# print categorical variables with missing data
for col in categorical:
    if X_train[col].isnull().mean() > 0:
        print(col, (X_train[col].isnull().mean()))
# impute missing categorical variables with most frequent value
for df2 in [X_train, X_test]:
    df2['workclass'].fillna(X_train['workclass'].mode()[0], inplace=True)
    df2['occupation'].fillna(X_train['occupation'].mode()[0], inplace=True)
    df2['native_country'].fillna(X_train['native_country'].mode()[0], inplace=True)
# check missing values in categorical variables in X_train
X_train[categorical].isnull().sum()
# check missing values in categorical variables in X_test
X_test[categorical].isnull().sum()
# check missing values in X_train
X_train.isnull().sum()
# check missing values in X_test
X_test.isnull().sum()
# print categorical variables
categorical
X_train[categorical].head()
# import category encoders
# if this module is not found, go to the terminal and enter the command: conda install -c conda-forge category_encoders
import category_encoders as ce
# encode remaining variables with one-hot encoding
encoder = ce.OneHotEncoder(cols=['workclass', 'education', 'marital_status', 'occupation', 'relationship',
'race', 'sex', 'native_country'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)
X_train.head()
X_train.shape
X_test.head()
X_test.shape
# Feature scaling
cols = X_train.columns
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train = pd.DataFrame(X_train, columns=cols)
X_test = pd.DataFrame(X_test, columns=cols)
X_train.head()
# We now have X_train dataset ready to be fed into the Gaussian Naive Bayes classifier. I will do it as follows.
# 12. Model training
# train a Gaussian Naive Bayes classifier on the training set
from sklearn.naive_bayes import GaussianNB
# instantiate the model
gnb = GaussianNB()
# fit the model
gnb.fit(X_train, y_train)
# 13. Predict the results
y_pred = gnb.predict(X_test)
y_pred
# 14. Check accuracy score
from sklearn.metrics import accuracy_score
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))
# Compare the train-set and test-set accuracy
# Now, we will compare the train-set and test-set accuracy to check for overfitting.
y_pred_train = gnb.predict(X_train)
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))
# Check for overfitting and underfitting
# print the scores on training and test set
print('Training set score: {:.4f}'.format(gnb.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(gnb.score(X_test, y_test)))
# check class distribution in test set
y_test.value_counts()
# check null accuracy score
null_accuracy = (7407/(7407+2362)) # counts of the majority and minority class taken from y_test.value_counts() above
print('Null accuracy score: {0:0.4f}'. format(null_accuracy))
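# The counts 7407 and 2362 above are read off the value_counts() output and change with the split;
# a sketch that derives the null accuracy directly from y_test avoids hard-coding them:
null_accuracy = y_test.value_counts().max() / len(y_test)
print('Null accuracy score: {0:0.4f}'.format(null_accuracy))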
# 15. Confusion matrix
# Print the Confusion Matrix and slice it into four pieces
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[0,0])
print('\nTrue Negatives(TN) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])
# visualize confusion matrix with seaborn heatmap
cm_matrix = pd.DataFrame(data=cm, index=['Actual Positive:1', 'Actual Negative:0'],
                         columns=['Predict Positive:1', 'Predict Negative:0'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')
# 16. Classification metrics
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
TP = cm[0,0]
TN = cm[1,1]
FP = cm[0,1]
FN = cm[1,0]
# print classification accuracy
classification_accuracy = (TP + TN) / float(TP + TN + FP + FN)
print('Classification accuracy : {0:0.4f}'.format(classification_accuracy))
# print classification error
classification_error = (FP + FN) / float(TP + TN + FP + FN)
print('Classification error : {0:0.4f}'.format(classification_error))
# print precision score
precision = TP / float(TP + FP)
print('Precision : {0:0.4f}'.format(precision))
recall = TP / float(TP + FN)
print('Recall or Sensitivity : {0:0.4f}'.format(recall))
true_positive_rate = TP / float(TP + FN)
print('True Positive Rate : {0:0.4f}'.format(true_positive_rate))
false_positive_rate = FP / float(FP + TN)
print('False Positive Rate : {0:0.4f}'.format(false_positive_rate))
specificity = TN / (TN + FP)
print('Specificity : {0:0.4f}'.format(specificity))
# print the first 10 predicted probabilities of two classes- 0 and 1
y_pred_prob = gnb.predict_proba(X_test)[0:10]
y_pred_prob
# store the probabilities in dataframe
y_pred_prob_df = pd.DataFrame(data=y_pred_prob, columns=['Prob of - <=50K', 'Prob of - >50K'])
y_pred_prob_df
# print the first 10 predicted probabilities for class 1 - Probability of >50K
gnb.predict_proba(X_test)[0:10, 1]
# store the predicted probabilities for class 1 - Probability of >50K
y_pred1 = gnb.predict_proba(X_test)[:, 1]
# plot histogram of predicted probabilities
# adjust the font size
plt.rcParams['font.size'] = 12
# plot histogram with 10 bins
plt.hist(y_pred1, bins = 10)
# set the title of predicted probabilities
plt.title('Histogram of predicted probabilities of salaries >50K')
# set the x-axis limit
plt.xlim(0,1)
# set the x-axis label
plt.xlabel('Predicted probabilities of salaries >50K')
plt.ylabel('Frequency')
# plot ROC Curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred1, pos_label = '>50K')
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, linewidth=2)
plt.plot([0,1], [0,1], 'k--' )
plt.rcParams['font.size'] = 12
plt.title('ROC curve for Gaussian Naive Bayes Classifier for Predicting Salaries')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.show()
# compute ROC AUC
from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test, y_pred1)
print('ROC AUC : {:.4f}'.format(ROC_AUC))
# calculate cross-validated ROC AUC
from sklearn.model_selection import cross_val_score
Cross_validated_ROC_AUC = cross_val_score(gnb, X_train, y_train, cv=5, scoring='roc_auc').mean()
print('Cross validated ROC AUC : {:.4f}'.format(Cross_validated_ROC_AUC))
# Applying 10-Fold Cross Validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(gnb, X_train, y_train, cv = 10, scoring='accuracy')
print('Cross-validation scores:{}'.format(scores))
# compute Average cross-validation score
print('Average cross-validation score: {:.4f}'.format(scores.mean()))
Naive Bayes 2 (7-19-28)
# importing the libraries
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Importing the dataset
dataset = pd.read_csv('User_Data.csv')
x = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(x_test)
y_pred
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm
# Visualising the Training set results
from matplotlib.colors import ListedColormap
x_set, y_set = x_train, y_train
X1, X2 = nm.meshgrid(nm.arange(start = x_set[:, 0].min() - 1, stop = x_set[:, 0].max() + 1, step = 0.01),
nm.arange(start = x_set[:, 1].min() - 1, stop = x_set[:, 1].max() + 1, step = 0.01))
mtp.contourf(X1, X2, classifier.predict(nm.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('lightblue', 'lightgreen')))
mtp.xlim(X1.min(), X1.max())
mtp.ylim(X2.min(), X2.max())
for i, j in enumerate(nm.unique(y_set)):
    mtp.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
                c = ListedColormap(('purple', 'green'))(i), label = j)
mtp.title('Naive Bayes (Training set)')
mtp.xlabel('Age')
mtp.ylabel('Estimated Salary')
mtp.legend()
mtp.show()
# Visualising the Test set results
from matplotlib.colors import ListedColormap
x_set, y_set = x_test, y_test
X1, X2 = nm.meshgrid(nm.arange(start = x_set[:, 0].min() - 1, stop = x_set[:, 0].max() + 1, step = 0.01),
nm.arange(start = x_set[:, 1].min() - 1, stop = x_set[:, 1].max() + 1, step = 0.01))
mtp.contourf(X1, X2, classifier.predict(nm.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('lightblue', 'lightgreen')))
mtp.xlim(X1.min(), X1.max())
mtp.ylim(X2.min(), X2.max())
for i, j in enumerate(nm.unique(y_set)):
    mtp.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
                c = ListedColormap(('purple', 'green'))(i), label = j)
mtp.title('Naive Bayes (test set)')
mtp.xlabel('Age')
mtp.ylabel('Estimated Salary')
mtp.legend()
mtp.show()
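# The confusion matrix above can be summarised with a single accuracy figure; a minimal sketch:
from sklearn.metrics import accuracy_score
print('Accuracy: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))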
Decision Tree (8-20-29)
# Load libraries
import numpy as np
import pandas as pd
from sklearn import metrics
# Importing the Dataset
df=pd.read_csv("PlayTennis.csv")
value=['Outlook','Temprature','Humidity','Wind']
df
# Data Analysis
len(df) #Dataset Length
df.shape #To see the number of rows and columns in our dataset
df.head() #To inspect the first five records of the dataset
df.tail() #To inspect the last five records of the dataset
df.describe() #To see statistical details of the dataset
# Preparing the Data (Data Slicing)
#machine learning algorithms can only learn from numbers (int, float, doubles .. )
#so let us encode it to int
from sklearn import preprocessing
string_to_int= preprocessing.LabelEncoder() #encode your data
df=df.apply(string_to_int.fit_transform) #fit and transform it
df
#To divide our data into attribute set and Label:
feature_cols = ['Outlook','Temprature','Humidity','Wind']
X = df[feature_cols] #contains the attribute
y = df.Play_Tennis #contains the label
#To divide our data into training and test sets:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
# perform training
# import the classifier
from sklearn.tree import DecisionTreeClassifier
# create a classifier object
classifier =DecisionTreeClassifier(criterion="entropy", random_state=100)
# fit the classifier with the training data
classifier.fit(X_train, y_train)
#Predict the response for test dataset
y_pred= classifier.predict(X_test)
# Model Accuracy, how often is the classifier correct?
from sklearn.metrics import accuracy_score
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Now let's compare some of our predicted values with the actual values and see how accurate we were:
data_p=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
data_p
# Evaluating the Algorithm
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Visualizing Decision Trees
from sklearn.tree import export_graphviz
from six import StringIO
# from sklearn.externals.six import StringIO
from IPython.display import Image
import pydotplus
# if this module is not found then go to the terminal and enter the command: conda install -c conda-forge pydotplus
dot_data = StringIO()
export_graphviz(classifier, out_file=dot_data,
filled=True, rounded=True,
special_characters=True,feature_names=value, class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('PlayTennis.png')
Image(graph.create_png())
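# If graphviz/pydotplus is not available, scikit-learn's own plot_tree can draw the same fitted
# classifier; a short alternative sketch:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
plt.figure(figsize=(12, 8))
plot_tree(classifier, feature_names=value, class_names=['0', '1'], filled=True, rounded=True)
plt.show()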
Linear SVM (9-21-30)
# Import the Libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
# Import some Data from the iris Data Set
iris = datasets.load_iris()
# Take only the first two features of the data.
# A two-dimensional dataset could be used instead to avoid the slicing.
X = iris.data[:, :2]
y = iris.target
# C is the SVM regularization parameter
C = 1.0
# Create an Instance of SVM and Fit out the data.
# Data is not scaled so as to be able to plot the support vectors
svc = svm.SVC(kernel ='linear', C = 1).fit(X, y)
# create a mesh to plot
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
h = (x_max - x_min) / 100  # step size for the mesh grid
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# Plot the data for Proper Visual Representation
plt.subplot(1, 1, 1)
# Predict the result by giving Data to the model
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap = plt.cm.Paired, alpha = 0.2)
plt.scatter(X[:, 0], X[:, 1], c = y, cmap = plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.title('SVC with linear kernel')
# Output the Plot
plt.show()
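# The plot shows the decision regions but not how well the model generalises; a minimal sketch of
# a held-out accuracy check for the same linear SVC (the split parameters are an assumption):
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
svc_eval = svm.SVC(kernel='linear', C=C).fit(X_train, y_train)
print('Test accuracy: {0:0.4f}'.format(accuracy_score(y_test, svc_eval.predict(X_test))))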
