Added the Decision tree classifier (#224)

* Random Forest Classifier implemented * removed the .ipynb_checkpoints in machine-learning/iris * Removed the .ipynb_checkpoints in machine_learning/iris/rfc * Decision Tree Classifier implemented * removed the directory md that was in the repo but not locally
WebClub-NITK · Oct 20, 2018 · 1ad1e90 · 1ad1e90
1 parent 042120c
commit 1ad1e90
Show file tree

Hide file tree

Showing 2 changed files with 216 additions and 0 deletions.
diff --git a/machine_learning/iris/dtc/README.md b/machine_learning/iris/dtc/README.md
@@ -0,0 +1 @@
+Decision tree classifier for the Iris dataset, implemented with scikit-learn's `DecisionTreeClassifier`.
diff --git a/machine_learning/iris/dtc/decision_tree_implementation.ipynb b/machine_learning/iris/dtc/decision_tree_implementation.ipynb
@@ -0,0 +1,215 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.datasets import load_iris\n",
+    "from sklearn.model_selection import cross_val_score\n",
+    "from sklearn.tree import DecisionTreeClassifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Import dataset\n",
+    "dataset_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data\"\n",
+    "column_names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']\n",
+    "dataset = pd.read_csv(dataset_url, names=column_names)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(150, 5)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Dimensions of dataset\n",
+    "print(dataset.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "     sepal-length  sepal-width  petal-length  petal-width            class\n",
+      "30            4.8          3.1           1.6          0.2      Iris-setosa\n",
+      "82            5.8          2.7           3.9          1.2  Iris-versicolor\n",
+      "108           6.7          2.5           5.8          1.8   Iris-virginica\n",
+      "32            5.2          4.1           1.5          0.1      Iris-setosa\n",
+      "90            5.5          2.6           4.4          1.2  Iris-versicolor\n",
+      "55            5.7          2.8           4.5          1.3  Iris-versicolor\n",
+      "31            5.4          3.4           1.5          0.4      Iris-setosa\n",
+      "130           7.4          2.8           6.1          1.9   Iris-virginica\n",
+      "6             4.6          3.4           1.4          0.3      Iris-setosa\n",
+      "48            5.3          3.7           1.5          0.2      Iris-setosa\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 10 random rows of the dataset\n",
+    "print(dataset.sample(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "       sepal-length  sepal-width  petal-length  petal-width\n",
+      "count    150.000000   150.000000    150.000000   150.000000\n",
+      "mean       5.843333     3.054000      3.758667     1.198667\n",
+      "std        0.828066     0.433594      1.764420     0.763161\n",
+      "min        4.300000     2.000000      1.000000     0.100000\n",
+      "25%        5.100000     2.800000      1.600000     0.300000\n",
+      "50%        5.800000     3.000000      4.350000     1.300000\n",
+      "75%        6.400000     3.300000      5.100000     1.800000\n",
+      "max        7.900000     4.400000      6.900000     2.500000\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Description of dataset\n",
+    "print(dataset.describe())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "X = dataset.iloc[:,0:4] \n",
+    "y = dataset['class']\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# 1. Train the model on training data (X_train, y_train): first create the classifier; it is fitted in the next cell\n",
+    "dtc = DecisionTreeClassifier(random_state=0)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
+       "            max_features=None, max_leaf_nodes=None,\n",
+       "            min_impurity_decrease=0.0, min_impurity_split=None,\n",
+       "            min_samples_leaf=1, min_samples_split=2,\n",
+       "            min_weight_fraction_leaf=0.0, presort=False, random_state=0,\n",
+       "            splitter='best')"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# 1 (continued). Fit the classifier on the training data (X_train, y_train)\n",
+    "dtc.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[ 0.01256535  0.02915555  0.05981177  0.89846733]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(dtc.feature_importances_)\n",
+    "# 2. Predict target values using test data (X_test)\n",
+    "predictions = dtc.predict(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy Score is :0.96\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 3. Compute the model's accuracy on the test data (X_test) against the actual targets (y_test)\n",
+    "score = dtc.score(X_test,y_test)\n",
+    "print(\"Accuracy Score is :\" + str(score))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}