Skip to content
Permalink
bd208cefbd
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
1583 lines (1583 sloc) 65 KB
{
"cells": [
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"'''numpy'''\n",
"import numpy as np\n",
"\n",
"'''pandas'''\n",
"import pandas as pd \n",
"\n",
"'''time'''\n",
"import time\n",
"\n",
"'''seaborn'''\n",
"import seaborn as sn\n",
"\n",
"'''sklearn'''\n",
"from sklearn import neighbors, metrics \n",
"from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold\n",
"from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"from sklearn.impute import SimpleImputer\n",
"\n",
"'''matplotlib'''\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Report ID Arrest Date Time Area ID Area Name \\\n",
"0 5666847 2019-06-22T00:00:00.000 1630.0 14 Pacific \n",
"1 5666688 2019-06-22T00:00:00.000 1010.0 10 West Valley \n",
"2 5666570 2019-06-22T00:00:00.000 400.0 15 N Hollywood \n",
"3 5666529 2019-06-22T00:00:00.000 302.0 17 Devonshire \n",
"4 5666742 2019-06-22T00:00:00.000 1240.0 14 Pacific \n",
"... ... ... ... ... ... \n",
"1276155 100504416 2010-01-01T00:00:00.000 1430.0 5 Harbor \n",
"1276156 101104731 2010-01-01T00:00:00.000 2215.0 11 Northeast \n",
"1276157 101104211 2010-01-01T00:00:00.000 1310.0 11 Northeast \n",
"1276158 2179817 2010-01-01T00:00:00.000 319.0 14 Pacific \n",
"1276159 2180332 2010-01-01T00:00:00.000 1815.0 19 Mission \n",
"\n",
" Reporting District Age Sex Code Descent Code Charge Group Code \\\n",
"0 1457 44 M W 24.0 \n",
"1 1061 8 M O NaN \n",
"2 1543 31 F O 22.0 \n",
"3 1738 23 F W 22.0 \n",
"4 1472 28 M W 8.0 \n",
"... ... ... ... ... ... \n",
"1276155 521 17 M H 24.0 \n",
"1276156 1118 12 M H 24.0 \n",
"1276157 1128 52 M H 18.0 \n",
"1276158 1408 24 M H 22.0 \n",
"1276159 1994 25 M W 16.0 \n",
"\n",
" ... Charge Description \\\n",
"0 ... VANDALISM \n",
"1 ... NaN \n",
"2 ... DRUNK DRIVING ALCOHOL/DRUGS \n",
"3 ... DRUNK DRIVING ALCOHOL/DRUGS \n",
"4 ... OBSTRUCT/RESIST EXECUTIVE OFFICER \n",
"... ... ... \n",
"1276155 ... MINOR BUY/ETC TOBACCO/ETC \n",
"1276156 ... CURFEW - JUV ONLY \n",
"1276157 ... DRINKING IN PUBLIC \n",
"1276158 ... DRUNK DRIVING ALCOHOL/DRUGS \n",
"1276159 ... POSSESSION CONTROLLED SUBSTANCE \n",
"\n",
" Address \\\n",
"0 12300 CULVER BL \n",
"1 19000 VANOWEN ST \n",
"2 MAGNOLIA AV \n",
"3 HAYVENHURST ST \n",
"4 6600 ESPLANADE ST \n",
"... ... \n",
"1276155 4TH \n",
"1276156 AVENUE 58 \n",
"1276157 YORK BL \n",
"1276158 NATIONAL BL \n",
"1276159 ROSCOE \n",
"\n",
" Cross Street \\\n",
"0 NaN \n",
"1 NaN \n",
"2 LAUREL CANYON BL \n",
"3 N REGAN FY \n",
"4 NaN \n",
"... ... \n",
"1276155 GAFFEY \n",
"1276156 FIGUEROA ST \n",
"1276157 AVENUE 63 \n",
"1276158 MANNING AV \n",
"1276159 WILLIS \n",
"\n",
" Location Zip Codes \\\n",
"0 {'latitude': '33.992', 'human_address': '{\"add... 24031.0 \n",
"1 {'latitude': '34.1687', 'human_address': '{\"ad... 19339.0 \n",
"2 {'latitude': '34.1649', 'human_address': '{\"ad... 8890.0 \n",
"3 {'latitude': '34.2692', 'human_address': '{\"ad... 19329.0 \n",
"4 {'latitude': '33.9609', 'human_address': '{\"ad... 25075.0 \n",
"... ... ... \n",
"1276155 {'latitude': '33.7406', 'human_address': '{\"ad... 3342.0 \n",
"1276156 {'latitude': '34.1101', 'human_address': '{\"ad... 23673.0 \n",
"1276157 {'latitude': '34.1148', 'human_address': '{\"ad... 23673.0 \n",
"1276158 {'latitude': '34.0301', 'human_address': '{\"ad... 23451.0 \n",
"1276159 {'latitude': '34.2215', 'human_address': '{\"ad... 19730.0 \n",
"\n",
" Census Tracts Precinct Boundaries LA Specific Plans \\\n",
"0 918.0 1137.0 10.0 \n",
"1 321.0 1494.0 NaN \n",
"2 205.0 1332.0 17.0 \n",
"3 69.0 388.0 NaN \n",
"4 937.0 241.0 10.0 \n",
"... ... ... ... \n",
"1276155 975.0 1205.0 NaN \n",
"1276156 370.0 477.0 28.0 \n",
"1276157 359.0 575.0 NaN \n",
"1276158 872.0 1124.0 9.0 \n",
"1276159 147.0 418.0 NaN \n",
"\n",
" Council Districts Neighborhood Councils (Certified) \n",
"0 10.0 85.0 \n",
"1 4.0 10.0 \n",
"2 5.0 39.0 \n",
"3 2.0 78.0 \n",
"4 10.0 16.0 \n",
"... ... ... \n",
"1276155 15.0 36.0 \n",
"1276156 11.0 93.0 \n",
"1276157 9.0 93.0 \n",
"1276158 6.0 75.0 \n",
"1276159 3.0 59.0 \n",
"\n",
"[1276160 rows x 23 columns]\n"
]
}
],
"source": [
"'''Read and display data'''\n",
"data = pd.read_csv(\"Datasets/LA Crime Data/arrest-data-from-2010-to-present.csv\") #Load in the arrest data\n",
"print(data) #Display the data"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {},
"outputs": [],
"source": [
"'''Assigning attributes to use'''\n",
"data = data.head(n=100000)\n",
"\n",
"selectedData = data[[ #Here I am assigning all the values to see if any make a signifciant difference\n",
" 'Sex Code',\n",
" 'Descent Code',\n",
" 'Charge Description'\n",
"]].values"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Sex Code Decent Code Charge Description\n",
"0 M W VANDALISM\n",
"2 F O DRUNK DRIVING ALCOHOL/DRUGS\n",
"3 F W DRUNK DRIVING ALCOHOL/DRUGS\n",
"4 M W OBSTRUCT/RESIST EXECUTIVE OFFICER\n",
"5 M H PARENT IN CUSTODY, NO CARETAKER AVAILABLE\n",
".. ... ... ...\n",
"994 M H PAROLE WARRANT\n",
"995 M H WARRANT-PETITION TO REVOKE COMMUNITY SUPVN\n",
"996 M B DRINKING IN PUBLIC\n",
"998 M H BURGLARY\n",
"999 F H CORPORAL INJURY ON SPOUSE/COHABITANT/ETC\n",
"\n",
"[938 rows x 3 columns]\n"
]
}
],
"source": [
"selectedData = pd.DataFrame(selectedData, columns = ['Sex Code','Decent Code','Charge Description'])\n",
"selectedData = selectedData.dropna()\n",
"print(selectedData)\n"
]
},
{
"cell_type": "code",
"execution_count": 187,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(938, 1) (938, 1)\n"
]
}
],
"source": [
"X = selectedData[[ #Here I am assigning all the values to see if any make a signifciant difference\n",
" #'Sex Code',\n",
" 'Charge Description'\n",
"]].values\n",
"\n",
"\n",
"#Set the class\n",
"y = selectedData[[\n",
" 'Decent Code'\n",
"]]\n",
"\n",
"#X = selectedData.to_numpy()\n",
"\n",
"print(X.shape, y.shape)\n",
"\n",
"y = np.ravel(y)"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'A', 'B', 'F', 'O', 'W', 'H'}\n",
"[[186]\n",
" [56]\n",
" [56]\n",
" [115]\n",
" [122]\n",
" [135]\n",
" [56]\n",
" [111]\n",
" [56]\n",
" [112]\n",
" [49]\n",
" [56]\n",
" [149]\n",
" [143]\n",
" [4]\n",
" [182]\n",
" [103]\n",
" [77]\n",
" [39]\n",
" [146]\n",
" [56]\n",
" [56]\n",
" [56]\n",
" [182]\n",
" [122]\n",
" [143]\n",
" [48]\n",
" [165]\n",
" [20]\n",
" [183]\n",
" [56]\n",
" [142]\n",
" [175]\n",
" [165]\n",
" [58]\n",
" [140]\n",
" [56]\n",
" [185]\n",
" [103]\n",
" [56]\n",
" [56]\n",
" [169]\n",
" [59]\n",
" [132]\n",
" [56]\n",
" [59]\n",
" [151]\n",
" [165]\n",
" [139]\n",
" [59]\n",
" [95]\n",
" [84]\n",
" [192]\n",
" [165]\n",
" [120]\n",
" [20]\n",
" [151]\n",
" [48]\n",
" [56]\n",
" [91]\n",
" [56]\n",
" [165]\n",
" [105]\n",
" [59]\n",
" [143]\n",
" [151]\n",
" [56]\n",
" [142]\n",
" [55]\n",
" [163]\n",
" [143]\n",
" [143]\n",
" [37]\n",
" [151]\n",
" [80]\n",
" [17]\n",
" [59]\n",
" [165]\n",
" [123]\n",
" [119]\n",
" [189]\n",
" [51]\n",
" [102]\n",
" [165]\n",
" [56]\n",
" [70]\n",
" [59]\n",
" [56]\n",
" [123]\n",
" [18]\n",
" [20]\n",
" [20]\n",
" [16]\n",
" [56]\n",
" [143]\n",
" [158]\n",
" [142]\n",
" [103]\n",
" [56]\n",
" [56]\n",
" [17]\n",
" [59]\n",
" [50]\n",
" [158]\n",
" [18]\n",
" [59]\n",
" [56]\n",
" [107]\n",
" [159]\n",
" [35]\n",
" [183]\n",
" [34]\n",
" [143]\n",
" [59]\n",
" [143]\n",
" [58]\n",
" [17]\n",
" [105]\n",
" [171]\n",
" [58]\n",
" [20]\n",
" [143]\n",
" [189]\n",
" [56]\n",
" [185]\n",
" [143]\n",
" [56]\n",
" [7]\n",
" [91]\n",
" [165]\n",
" [56]\n",
" [56]\n",
" [161]\n",
" [56]\n",
" [143]\n",
" [21]\n",
" [139]\n",
" [56]\n",
" [151]\n",
" [78]\n",
" [143]\n",
" [59]\n",
" [188]\n",
" [56]\n",
" [147]\n",
" [56]\n",
" [139]\n",
" [165]\n",
" [48]\n",
" [84]\n",
" [56]\n",
" [56]\n",
" [151]\n",
" [56]\n",
" [139]\n",
" [165]\n",
" [55]\n",
" [123]\n",
" [58]\n",
" [56]\n",
" [151]\n",
" [140]\n",
" [59]\n",
" [17]\n",
" [143]\n",
" [175]\n",
" [62]\n",
" [98]\n",
" [56]\n",
" [132]\n",
" [131]\n",
" [139]\n",
" [111]\n",
" [146]\n",
" [36]\n",
" [136]\n",
" [192]\n",
" [178]\n",
" [15]\n",
" [143]\n",
" [151]\n",
" [192]\n",
" [137]\n",
" [101]\n",
" [117]\n",
" [151]\n",
" [142]\n",
" [151]\n",
" [167]\n",
" [25]\n",
" [114]\n",
" [59]\n",
" [114]\n",
" [131]\n",
" [56]\n",
" [77]\n",
" [166]\n",
" [13]\n",
" [180]\n",
" [191]\n",
" [17]\n",
" [20]\n",
" [151]\n",
" [146]\n",
" [126]\n",
" [154]\n",
" [46]\n",
" [56]\n",
" [34]\n",
" [36]\n",
" [50]\n",
" [146]\n",
" [169]\n",
" [94]\n",
" [186]\n",
" [45]\n",
" [154]\n",
" [44]\n",
" [130]\n",
" [55]\n",
" [103]\n",
" [123]\n",
" [116]\n",
" [56]\n",
" [55]\n",
" [13]\n",
" [34]\n",
" [6]\n",
" [159]\n",
" [143]\n",
" [6]\n",
" [6]\n",
" [138]\n",
" [178]\n",
" [142]\n",
" [143]\n",
" [192]\n",
" [86]\n",
" [144]\n",
" [37]\n",
" [58]\n",
" [6]\n",
" [42]\n",
" [159]\n",
" [74]\n",
" [191]\n",
" [25]\n",
" [84]\n",
" [142]\n",
" [50]\n",
" [143]\n",
" [56]\n",
" [126]\n",
" [28]\n",
" [84]\n",
" [101]\n",
" [141]\n",
" [71]\n",
" [126]\n",
" [5]\n",
" [165]\n",
" [91]\n",
" [28]\n",
" [45]\n",
" [17]\n",
" [143]\n",
" [182]\n",
" [192]\n",
" [65]\n",
" [143]\n",
" [35]\n",
" [56]\n",
" [56]\n",
" [178]\n",
" [131]\n",
" [169]\n",
" [114]\n",
" [55]\n",
" [178]\n",
" [30]\n",
" [105]\n",
" [152]\n",
" [131]\n",
" [17]\n",
" [126]\n",
" [43]\n",
" [10]\n",
" [192]\n",
" [132]\n",
" [56]\n",
" [143]\n",
" [6]\n",
" [61]\n",
" [175]\n",
" [123]\n",
" [51]\n",
" [84]\n",
" [73]\n",
" [86]\n",
" [192]\n",
" [18]\n",
" [123]\n",
" [36]\n",
" [178]\n",
" [186]\n",
" [191]\n",
" [186]\n",
" [142]\n",
" [123]\n",
" [100]\n",
" [159]\n",
" [123]\n",
" [56]\n",
" [178]\n",
" [124]\n",
" [94]\n",
" [34]\n",
" [186]\n",
" [70]\n",
" [191]\n",
" [34]\n",
" [20]\n",
" [191]\n",
" [156]\n",
" [192]\n",
" [165]\n",
" [23]\n",
" [84]\n",
" [96]\n",
" [20]\n",
" [12]\n",
" [141]\n",
" [165]\n",
" [192]\n",
" [55]\n",
" [103]\n",
" [130]\n",
" [131]\n",
" [165]\n",
" [178]\n",
" [169]\n",
" [20]\n",
" [77]\n",
" [191]\n",
" [139]\n",
" [51]\n",
" [186]\n",
" [188]\n",
" [34]\n",
" [165]\n",
" [123]\n",
" [25]\n",
" [149]\n",
" [89]\n",
" [114]\n",
" [45]\n",
" [46]\n",
" [164]\n",
" [171]\n",
" [143]\n",
" [56]\n",
" [59]\n",
" [10]\n",
" [2]\n",
" [18]\n",
" [143]\n",
" [11]\n",
" [56]\n",
" [103]\n",
" [143]\n",
" [58]\n",
" [47]\n",
" [115]\n",
" [192]\n",
" [22]\n",
" [34]\n",
" [20]\n",
" [162]\n",
" [90]\n",
" [36]\n",
" [187]\n",
" [144]\n",
" [143]\n",
" [25]\n",
" [46]\n",
" [20]\n",
" [15]\n",
" [56]\n",
" [99]\n",
" [143]\n",
" [104]\n",
" [175]\n",
" [46]\n",
" [175]\n",
" [123]\n",
" [77]\n",
" [33]\n",
" [79]\n",
" [46]\n",
" [6]\n",
" [58]\n",
" [154]\n",
" [60]\n",
" [159]\n",
" [151]\n",
" [36]\n",
" [137]\n",
" [56]\n",
" [56]\n",
" [101]\n",
" [56]\n",
" [183]\n",
" [17]\n",
" [110]\n",
" [155]\n",
" [6]\n",
" [165]\n",
" [17]\n",
" [19]\n",
" [126]\n",
" [46]\n",
" [34]\n",
" [6]\n",
" [118]\n",
" [20]\n",
" [132]\n",
" [37]\n",
" [180]\n",
" [124]\n",
" [18]\n",
" [155]\n",
" [51]\n",
" [6]\n",
" [186]\n",
" [165]\n",
" [85]\n",
" [53]\n",
" [72]\n",
" [34]\n",
" [34]\n",
" [35]\n",
" [77]\n",
" [47]\n",
" [56]\n",
" [103]\n",
" [85]\n",
" [50]\n",
" [143]\n",
" [24]\n",
" [46]\n",
" [79]\n",
" [118]\n",
" [17]\n",
" [145]\n",
" [192]\n",
" [46]\n",
" [64]\n",
" [143]\n",
" [192]\n",
" [25]\n",
" [25]\n",
" [56]\n",
" [6]\n",
" [34]\n",
" [19]\n",
" [25]\n",
" [59]\n",
" [36]\n",
" [77]\n",
" [50]\n",
" [137]\n",
" [79]\n",
" [191]\n",
" [34]\n",
" [158]\n",
" [173]\n",
" [150]\n",
" [6]\n",
" [159]\n",
" [34]\n",
" [17]\n",
" [29]\n",
" [57]\n",
" [143]\n",
" [20]\n",
" [191]\n",
" [177]\n",
" [77]\n",
" [17]\n",
" [171]\n",
" [31]\n",
" [42]\n",
" [17]\n",
" [86]\n",
" [190]\n",
" [85]\n",
" [171]\n",
" [193]\n",
" [56]\n",
" [46]\n",
" [36]\n",
" [171]\n",
" [20]\n",
" [6]\n",
" [50]\n",
" [6]\n",
" [20]\n",
" [36]\n",
" [159]\n",
" [180]\n",
" [14]\n",
" [32]\n",
" [85]\n",
" [114]\n",
" [7]\n",
" [25]\n",
" [171]\n",
" [187]\n",
" [55]\n",
" [143]\n",
" [55]\n",
" [34]\n",
" [41]\n",
" [171]\n",
" [25]\n",
" [55]\n",
" [159]\n",
" [171]\n",
" [146]\n",
" [85]\n",
" [20]\n",
" [123]\n",
" [34]\n",
" [143]\n",
" [169]\n",
" [171]\n",
" [34]\n",
" [6]\n",
" [192]\n",
" [182]\n",
" [159]\n",
" [131]\n",
" [10]\n",
" [56]\n",
" [97]\n",
" [46]\n",
" [143]\n",
" [131]\n",
" [179]\n",
" [142]\n",
" [128]\n",
" [176]\n",
" [17]\n",
" [56]\n",
" [56]\n",
" [57]\n",
" [86]\n",
" [6]\n",
" [56]\n",
" [158]\n",
" [114]\n",
" [69]\n",
" [56]\n",
" [56]\n",
" [144]\n",
" [46]\n",
" [174]\n",
" [10]\n",
" [143]\n",
" [56]\n",
" [109]\n",
" [34]\n",
" [149]\n",
" [51]\n",
" [123]\n",
" [134]\n",
" [191]\n",
" [55]\n",
" [52]\n",
" [8]\n",
" [151]\n",
" [159]\n",
" [129]\n",
" [6]\n",
" [46]\n",
" [50]\n",
" [22]\n",
" [115]\n",
" [146]\n",
" [144]\n",
" [77]\n",
" [56]\n",
" [85]\n",
" [190]\n",
" [165]\n",
" [143]\n",
" [143]\n",
" [158]\n",
" [34]\n",
" [6]\n",
" [157]\n",
" [136]\n",
" [25]\n",
" [143]\n",
" [56]\n",
" [123]\n",
" [6]\n",
" [20]\n",
" [29]\n",
" [56]\n",
" [143]\n",
" [10]\n",
" [192]\n",
" [36]\n",
" [165]\n",
" [37]\n",
" [51]\n",
" [192]\n",
" [159]\n",
" [135]\n",
" [59]\n",
" [34]\n",
" [29]\n",
" [34]\n",
" [83]\n",
" [56]\n",
" [95]\n",
" [172]\n",
" [184]\n",
" [159]\n",
" [34]\n",
" [25]\n",
" [87]\n",
" [140]\n",
" [35]\n",
" [166]\n",
" [151]\n",
" [137]\n",
" [94]\n",
" [191]\n",
" [126]\n",
" [146]\n",
" [15]\n",
" [56]\n",
" [175]\n",
" [86]\n",
" [6]\n",
" [143]\n",
" [113]\n",
" [24]\n",
" [6]\n",
" [131]\n",
" [143]\n",
" [50]\n",
" [41]\n",
" [126]\n",
" [153]\n",
" [112]\n",
" [45]\n",
" [144]\n",
" [36]\n",
" [77]\n",
" [165]\n",
" [46]\n",
" [159]\n",
" [25]\n",
" [169]\n",
" [41]\n",
" [183]\n",
" [41]\n",
" [153]\n",
" [88]\n",
" [171]\n",
" [155]\n",
" [123]\n",
" [143]\n",
" [176]\n",
" [165]\n",
" [20]\n",
" [57]\n",
" [86]\n",
" [56]\n",
" [5]\n",
" [115]\n",
" [181]\n",
" [56]\n",
" [143]\n",
" [56]\n",
" [165]\n",
" [56]\n",
" [85]\n",
" [131]\n",
" [86]\n",
" [6]\n",
" [75]\n",
" [95]\n",
" [126]\n",
" [151]\n",
" [148]\n",
" [142]\n",
" [176]\n",
" [58]\n",
" [143]\n",
" [76]\n",
" [142]\n",
" [40]\n",
" [6]\n",
" [160]\n",
" [3]\n",
" [165]\n",
" [118]\n",
" [51]\n",
" [169]\n",
" [25]\n",
" [17]\n",
" [45]\n",
" [100]\n",
" [157]\n",
" [56]\n",
" [34]\n",
" [6]\n",
" [170]\n",
" [29]\n",
" [192]\n",
" [44]\n",
" [126]\n",
" [20]\n",
" [183]\n",
" [189]\n",
" [126]\n",
" [63]\n",
" [77]\n",
" [79]\n",
" [28]\n",
" [51]\n",
" [159]\n",
" [157]\n",
" [84]\n",
" [41]\n",
" [56]\n",
" [56]\n",
" [0]\n",
" [34]\n",
" [17]\n",
" [20]\n",
" [137]\n",
" [9]\n",
" [86]\n",
" [34]\n",
" [154]\n",
" [27]\n",
" [10]\n",
" [86]\n",
" [30]\n",
" [51]\n",
" [123]\n",
" [52]\n",
" [29]\n",
" [149]\n",
" [169]\n",
" [101]\n",
" [85]\n",
" [20]\n",
" [166]\n",
" [181]\n",
" [51]\n",
" [191]\n",
" [143]\n",
" [67]\n",
" [155]\n",
" [20]\n",
" [20]\n",
" [151]\n",
" [81]\n",
" [56]\n",
" [20]\n",
" [165]\n",
" [94]\n",
" [17]\n",
" [6]\n",
" [46]\n",
" [34]\n",
" [176]\n",
" [17]\n",
" [57]\n",
" [143]\n",
" [159]\n",
" [73]\n",
" [66]\n",
" [106]\n",
" [30]\n",
" [25]\n",
" [93]\n",
" [26]\n",
" [86]\n",
" [18]\n",
" [10]\n",
" [54]\n",
" [149]\n",
" [159]\n",
" [144]\n",
" [191]\n",
" [189]\n",
" [171]\n",
" [44]\n",
" [82]\n",
" [171]\n",
" [56]\n",
" [190]\n",
" [169]\n",
" [20]\n",
" [169]\n",
" [95]\n",
" [171]\n",
" [131]\n",
" [126]\n",
" [85]\n",
" [56]\n",
" [151]\n",
" [47]\n",
" [77]\n",
" [34]\n",
" [124]\n",
" [144]\n",
" [25]\n",
" [36]\n",
" [168]\n",
" [17]\n",
" [189]\n",
" [34]\n",
" [56]\n",
" [56]\n",
" [132]\n",
" [146]\n",
" [134]\n",
" [171]\n",
" [159]\n",
" [92]\n",
" [1]\n",
" [29]\n",
" [140]\n",
" [27]\n",
" [86]\n",
" [68]\n",
" [5]\n",
" [144]\n",
" [159]\n",
" [86]\n",
" [38]\n",
" [56]\n",
" [84]\n",
" [132]\n",
" [52]\n",
" [137]\n",
" [46]\n",
" [143]\n",
" [34]\n",
" [20]\n",
" [165]\n",
" [123]\n",
" [51]\n",
" [181]\n",
" [151]\n",
" [10]\n",
" [10]\n",
" [17]\n",
" [192]\n",
" [100]\n",
" [55]\n",
" [126]\n",
" [192]\n",
" [27]\n",
" [190]\n",
" [84]\n",
" [123]\n",
" [35]\n",
" [20]\n",
" [34]\n",
" [90]\n",
" [27]\n",
" [22]\n",
" [108]\n",
" [56]\n",
" [86]\n",
" [166]\n",
" [169]\n",
" [169]\n",
" [20]\n",
" [165]\n",
" [73]\n",
" [86]\n",
" [36]\n",
" [133]\n",
" [6]\n",
" [56]\n",
" [123]\n",
" [6]\n",
" [182]\n",
" [189]\n",
" [126]\n",
" [126]\n",
" [34]\n",
" [56]\n",
" [115]\n",
" [56]\n",
" [2]\n",
" [143]\n",
" [51]\n",
" [168]\n",
" [143]\n",
" [169]\n",
" [20]\n",
" [34]\n",
" [20]\n",
" [6]\n",
" [34]\n",
" [34]\n",
" [103]\n",
" [125]\n",
" [27]\n",
" [22]\n",
" [121]\n",
" [127]\n",
" [171]\n",
" [151]\n",
" [143]\n",
" [25]\n",
" [85]\n",
" [82]\n",
" [34]\n",
" [125]\n",
" [165]\n",
" [20]\n",
" [123]\n",
" [192]\n",
" [46]\n",
" [25]\n",
" [34]] [5 4 5 5 3 1 3 1 3 3 1 3 5 5 5 1 5 3 5 5 5 1 4 3 3 3 3 4 3 3 1 3 1 5 1 4 3\n",
" 5 3 3 1 3 1 1 1 3 1 3 3 1 1 1 3 3 5 1 3 3 5 1 3 1 3 3 3 3 1 4 3 5 5 1 3 3\n",
" 1 3 1 4 3 1 3 5 1 1 3 1 1 1 3 3 5 3 3 3 3 5 5 1 5 3 5 3 1 1 3 4 3 4 1 5 5\n",
" 5 1 3 1 3 4 3 1 3 1 1 3 5 1 3 4 5 3 3 3 3 1 3 4 1 3 3 4 3 5 1 5 1 3 3 3 1\n",
" 1 3 3 3 3 3 3 3 3 1 3 5 3 3 3 3 1 5 3 3 3 3 3 3 3 5 3 5 1 1 1 5 1 3 3 2 1\n",
" 3 3 3 3 1 3 1 5 5 3 1 5 3 1 1 1 1 3 1 4 3 5 3 3 3 3 1 1 3 1 1 5 5 3 1 3 1\n",
" 5 3 5 3 3 1 1 5 3 3 3 1 1 3 1 1 1 3 3 1 3 1 3 5 3 3 3 1 3 1 1 3 5 3 3 1 5\n",
" 3 5 1 1 1 5 3 5 5 3 5 3 3 3 1 5 1 3 5 1 1 5 3 3 1 1 3 5 1 3 3 5 3 3 3 3 3\n",
" 5 5 5 3 1 5 4 1 1 3 3 4 4 3 3 3 3 1 3 1 3 3 4 3 1 1 3 5 3 5 1 5 5 1 1 3 5\n",
" 3 1 3 3 5 1 1 5 1 5 1 4 3 3 3 1 3 1 5 3 5 1 1 1 1 3 5 3 3 1 1 4 3 3 3 3 1\n",
" 3 3 1 3 1 3 3 5 3 3 3 3 3 1 3 1 1 3 3 3 1 1 1 3 5 1 1 1 3 4 5 3 3 5 3 1 5\n",
" 3 3 1 3 3 3 3 3 4 3 3 1 1 3 3 3 1 1 1 3 3 3 5 3 3 3 5 3 1 3 3 1 1 3 1 3 1\n",
" 4 5 3 1 3 3 1 3 4 3 1 3 1 1 3 1 3 5 1 3 5 3 3 1 3 3 5 5 3 3 5 3 1 1 1 5 3\n",
" 3 3 3 3 1 1 1 3 3 5 1 3 3 1 3 3 3 3 5 5 5 1 5 3 5 1 1 1 5 3 5 5 3 3 3 1 3\n",
" 1 5 3 3 3 3 3 5 3 3 3 1 3 3 3 5 3 3 3 3 1 5 1 5 1 3 1 3 1 5 4 3 3 1 3 5 1\n",
" 3 3 3 3 3 3 1 1 1 1 3 1 1 1 3 5 5 4 3 1 3 5 3 3 3 1 1 1 1 3 3 3 5 5 3 1 3\n",
" 3 1 3 5 3 4 1 3 1 4 3 1 3 3 1 5 3 5 1 3 1 3 1 4 3 5 1 1 5 4 3 3 5 3 3 3 1\n",
" 1 3 3 5 1 3 1 1 5 3 1 1 1 1 3 1 5 1 3 5 3 1 1 3 3 5 5 1 1 5 3 3 1 1 3 1 3\n",
" 3 3 1 3 1 3 3 5 5 1 1 3 1 3 4 3 3 3 1 3 1 5 5 3 3 3 1 1 1 1 1 5 4 5 1 3 3\n",
" 3 1 3 3 3 3 3 3 1 3 1 5 1 5 1 3 1 1 3 1 3 5 1 1 3 3 5 3 1 1 1 1 3 1 0 3 5\n",
" 3 1 3 5 5 1 3 4 3 3 3 3 5 3 3 3 3 3 1 1 5 3 3 3 3 3 1 3 1 3 3 1 4 3 3 1 5\n",
" 3 3 3 3 4 1 1 5 5 1 1 1 5 1 3 1 5 3 4 3 3 3 1 5 3 1 3 1 3 1 5 3 1 1 3 1 3\n",
" 4 5 3 3 5 3 5 3 3 3 1 1 1 5 3 3 3 1 1 1 5 3 3 5 5 3 3 3 5 3 3 3 3 3 3 3 5\n",
" 5 3 4 3 3 3 4 5 1 3 4 0 1 3 3 5 1 3 5 3 3 3 3 1 3 3 3 3 3 4 1 3 1 5 3 5 1\n",
" 3 1 3 3 1 1 3 1 1 3 3 1 1 4 3 1 3 5 3 1 1 3 1 5 1 5 3 3 3 4 3 3 1 1 3 5 5\n",
" 5 3 3 0 3 3 3 4 3 3 1 3 3]\n"
]
}
],
"source": [
"'''Translate classification values into numerical values'''\n",
"Le = LabelEncoder() #Use the LabelEncoder library\n",
"for i in range(len(X[0])): #Iterate over instances of the data\n",
" X[:, i] = Le.fit_transform(X[:, i]) #Use fit_transform to convert values \n",
"\n",
"print(set(y))\n",
"yRealValues, y = np.unique(y, return_inverse=True)\n",
"\n",
"y = np.ravel(y) #Put the classification into a single 1D array to avoid future warning and error messages\n",
"print(X, y) #Ensure that the transformations have taken place correctly"
]
},
{
"cell_type": "code",
"execution_count": 189,
"metadata": {},
"outputs": [],
"source": [
"'''Simple function to train the models from the data'''\n",
"def trainModel(model):\n",
" model.fit(X_train, y_train) #Fit the model\n",
" prediction = model.predict(X_test) #Give the predictions for the y values\n",
" return round(metrics.accuracy_score(y_test, prediction), 3), classification_report(y_test, prediction) #Return the accuracy value and the report for the data"
]
},
{
"cell_type": "code",
"execution_count": 190,
"metadata": {},
"outputs": [],
"source": [
"knn = neighbors.KNeighborsClassifier(n_neighbors=11, weights='uniform') #Define the KNN algorithm"
]
},
{
"cell_type": "code",
"execution_count": 191,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) #Split the dataset"
]
},
{
"cell_type": "code",
"execution_count": 192,
"metadata": {},
"outputs": [],
"source": [
"def trainData(model):\n",
" start_time = time.time() #Set the starting execution time\n",
" acc, rep = trainModel(model) #set knn accuracy and report values on uncleaned data\n",
" finish_time = round(time.time() - start_time, 3)\n",
" print(\"{} seconds to run for {}\".format(finish_time, model)) #Display the runtime\n",
" return acc, rep"
]
},
{
"cell_type": "code",
"execution_count": 193,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.011 seconds to run for KNeighborsClassifier(n_neighbors=11)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\2011h\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, msg_start, len(result))\n"
]
}
],
"source": [
"knn_acc, knn_rep = trainData(knn)"
]
},
{
"cell_type": "code",
"execution_count": 194,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.059 seconds to run for LogisticRegression()\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\2011h\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n",
"C:\\Users\\2011h\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, msg_start, len(result))\n"
]
}
],
"source": [
"log_acc, log_rep = trainData(LogisticRegression())"
]
},
{
"cell_type": "code",
"execution_count": 195,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy = 0.463\n",
" precision recall f1-score support\n",
"\n",
" 0 0.00 0.00 0.00 1\n",
" 1 0.46 0.44 0.45 64\n",
" 3 0.49 0.73 0.58 81\n",
" 4 0.00 0.00 0.00 9\n",
" 5 0.00 0.00 0.00 33\n",
"\n",
" accuracy 0.46 188\n",
" macro avg 0.19 0.23 0.21 188\n",
"weighted avg 0.37 0.46 0.40 188\n",
"\n",
"Accuracy = 0.431\n",
" precision recall f1-score support\n",
"\n",
" 0 0.00 0.00 0.00 1\n",
" 1 0.00 0.00 0.00 64\n",
" 3 0.43 1.00 0.60 81\n",
" 4 0.00 0.00 0.00 9\n",
" 5 0.00 0.00 0.00 33\n",
"\n",
" accuracy 0.43 188\n",
" macro avg 0.09 0.20 0.12 188\n",
"weighted avg 0.19 0.43 0.26 188\n",
"\n"
]
}
],
"source": [
"print(\"Accuracy = {}\\n{}\".format(knn_acc, knn_rep)) #Display the accuracy and report for the entire dataset for knn\n",
"print(\"Accuracy = {}\\n{}\".format(log_acc, log_rep)) #Display the accuracy and report for the entire dataset for logistic regression"
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler() #Define which scaler to use\n",
"scale_X = scaler.fit_transform(X) #Scale the entire dataset\n",
"X_train, X_test, y_train, y_test = train_test_split(scale_X, y, test_size=0.2) #Split the dataset"
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.01 seconds to run for KNeighborsClassifier(n_neighbors=11)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\2011h\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, msg_start, len(result))\n"
]
}
],
"source": [
"knn_acc, knn_rep = trainData(knn)"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.025 seconds to run for LogisticRegression()\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\2011h\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, msg_start, len(result))\n"
]
}
],
"source": [
"log_acc, log_rep = trainData(LogisticRegression())"
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy = 0.394\n",
" precision recall f1-score support\n",
"\n",
" 0 0.00 0.00 0.00 1\n",
" 1 0.36 0.31 0.33 59\n",
" 3 0.47 0.62 0.54 88\n",
" 4 0.00 0.00 0.00 6\n",
" 5 0.05 0.03 0.04 34\n",
"\n",
" accuracy 0.39 188\n",
" macro avg 0.18 0.19 0.18 188\n",
"weighted avg 0.34 0.39 0.36 188\n",
"\n",
"Accuracy = 0.468\n",
" precision recall f1-score support\n",
"\n",
" 0 0.00 0.00 0.00 1\n",
" 1 0.00 0.00 0.00 59\n",
" 3 0.47 1.00 0.64 88\n",
" 4 0.00 0.00 0.00 6\n",
" 5 0.00 0.00 0.00 34\n",
"\n",
" accuracy 0.47 188\n",
" macro avg 0.09 0.20 0.13 188\n",
"weighted avg 0.22 0.47 0.30 188\n",
"\n"
]
}
],
"source": [
"print(\"Accuracy = {}\\n{}\".format(knn_acc, knn_rep)) #Display the accuracy and report for the entire dataset for knn\n",
"print(\"Accuracy = {}\\n{}\".format(log_acc, log_rep)) #Display the accuracy and report for the entire dataset for logistic regression"
]
},
{
"cell_type": "code",
"execution_count": 203,
"metadata": {},
"outputs": [],
"source": [
"'''Plot a confusion matrix'''\n",
"def c_m():\n",
" start_time = time.time() #Set the starting execution time\n",
" y_pred = accpred() #Get the predicted values for y\n",
" matrix = confusion_matrix(y_test, y_pred) #Build a confusion matrix for the predicted and actual values\n",
" #real_vals = ['K', 'P', 'S', 'D', 'F', 'G', 'C', 'H', 'L', 'A', 'I', 'X', 'W', 'Z', 'U', 'B', 'J', 'V', 'O'] #Set the prediction values back to diagnosis\n",
" real_vals = ['A', 'B', 'C', 'D', 'E']\n",
" df_cm = pd.DataFrame(matrix, columns=np.unique(real_vals), index = np.unique(real_vals)) #Set the columns as the real values\n",
" df_cm.index.name = 'Actual' #Label the x axis as Actual\n",
" df_cm.columns.name = 'Predicted' #Label the y axis as Predicted\n",
" plt.figure(figsize = (10,7)) #Set the size\n",
" sn.set(font_scale=1.4) #Label size\n",
" sn.heatmap(df_cm, cmap=\"Blues\", annot=True,annot_kws={\"size\": 16}) #Font size\n",
" finish_time = round(time.time() - start_time, 3)\n",
" print(\"{} seconds to run\".format(finish_time)) #Display the runtime"
]
},
{
"cell_type": "code",
"execution_count": 204,
"metadata": {},
"outputs": [],
"source": [
"def accpred():\n",
" model.fit(X_train, y_train) #Fit the model\n",
" return model.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 205,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"KNN\n",
"0.07 seconds to run\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 720x504 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"'''Display the confusion matrix for knn'''\n",
"model = knn #Set the model as knn\n",
"print(\"KNN\")\n",
"c_m()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''Display the confusion matrix for logistic regession'''\n",
"model = LogisticRegression() #Set the model as logistic regression\n",
"print(\"Logistic Regression\")\n",
"c_m()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}