Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "4acc92e1",
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.feature_selection import RFECV\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Read the dataset, taking the first CSV column as the row index.\n",
"df = pd.read_csv('for_python.csv', index_col=[0])\n",
"\n",
"# Separate the 'Diabetes_binary' target from the remaining feature columns.\n",
"X = df.drop(columns=['Diabetes_binary'])\n",
"y = df['Diabetes_binary']\n",
"\n",
"# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.3, random_state=0\n",
")"
]
},
{
"cell_type": "markdown",
"id": "518ab4b4",
"metadata": {},
"source": [
"# Resampling"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8a9d3001",
"metadata": {},
"outputs": [],
"source": [
"from imblearn.under_sampling import RandomUnderSampler\n",
"from imblearn.over_sampling import RandomOverSampler"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "68229bc2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Original dataset shape Counter({0: 217467, 1: 35025})\n",
"Y train Counter({0: 152288, 1: 24456})\n",
"Y test Counter({0: 65179, 1: 10569})\n",
"Y balanced only over sample Counter({0: 152288, 1: 60915})\n",
"Y balanced over and under sampling Counter({0: 121830, 1: 60915})\n"
]
}
],
"source": [
"# Class balance before any resampling.\n",
"print(f'Original dataset shape {Counter(y)}')\n",
"print(f'Y train {Counter(y_train)}')\n",
"print(f'Y test {Counter(y_test)}')\n",
"\n",
"# Seed both samplers (same seed as train_test_split) so the resampled rows are\n",
"# identical on every Restart & Run All; without random_state each run draws a\n",
"# different sample.\n",
"over = RandomOverSampler(sampling_strategy=0.4, random_state=0)\n",
"under = RandomUnderSampler(sampling_strategy=0.5, random_state=0)\n",
"\n",
"# Only the training split is resampled; the test split is left untouched.\n",
"# Step 1: oversample the minority class up to a 0.4 minority/majority ratio.\n",
"X_balanced, y_balanced = over.fit_resample(X_train, y_train)\n",
"# Step 2: undersample the majority class of that result down to a 0.5 ratio.\n",
"X_balanced2, y_balanced2 = under.fit_resample(X_balanced, y_balanced)\n",
"\n",
"print(f'Y balanced only over sample {Counter(y_balanced)}')\n",
"print(f'Y balanced over and under sampling {Counter(y_balanced2)}')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "dac26d12",
"metadata": {},
"outputs": [],
"source": [
"# Re-attach the label column to each resampled feature matrix and write the\n",
"# combined frame to CSV without the row index.\n",
"df_oversampled = pd.concat([X_balanced, y_balanced], axis='columns')\n",
"df_oversampled.to_csv('df_oversampled.csv', index=False)\n",
"\n",
"df_over_under_sampled = pd.concat([X_balanced2, y_balanced2], axis='columns')\n",
"df_over_under_sampled.to_csv('df_over_under_sampled.csv', index=False)"
]
},
{
"cell_type": "markdown",
"id": "306fbbf5",
"metadata": {},
"source": [
"# RFE"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "aef0cc2e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The optimal number of features: 21\n",
"Best features: Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',\n",
" 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',\n",
" 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',\n",
" 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',\n",
" 'Income'],\n",
" dtype='object')\n"
]
}
],
"source": [
"# Seed the forest: its bootstrap samples and per-split feature sub-sampling are\n",
"# random, so an unseeded estimator can rank features differently (and report a\n",
"# different optimum) on each run.\n",
"rfc = RandomForestClassifier(random_state=0)\n",
"\n",
"# Recursive feature elimination with 5-fold CV, removing one feature per step.\n",
"# NOTE(review): the resampled training set is still about 2:1 imbalanced, so\n",
"# scoring='balanced_accuracy' may be a better criterion than 'accuracy' - confirm.\n",
"rfecv = RFECV(estimator=rfc, step=1, cv=5, scoring='accuracy', n_jobs=-1)\n",
"rfecv.fit(X_balanced2, y_balanced2)\n",
"\n",
"print(\"The optimal number of features:\", rfecv.n_features_)\n",
"print(\"Best features:\", X_balanced2.columns[rfecv.support_])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9ca722d9",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Adapted from the scikit-learn RFECV example.\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Mean and standard deviation of the cross-validated score for each step of the\n",
"# elimination; the x-axis runs from 1 to n_scores selected features.\n",
"mean_scores = rfecv.cv_results_[\"mean_test_score\"]\n",
"std_scores = rfecv.cv_results_[\"std_test_score\"]\n",
"n_scores = len(mean_scores)\n",
"\n",
"fig, ax = plt.subplots()\n",
"ax.errorbar(range(1, n_scores + 1), mean_scores, yerr=std_scores)\n",
"ax.set_xlabel(\"Number of features selected\")\n",
"ax.set_ylabel(\"Mean test accuracy\")\n",
"# The line break must stay an escape sequence inside the literal; a raw newline\n",
"# in the cell source would split the string and raise a SyntaxError.\n",
"ax.set_title(\"Recursive Feature Elimination \\nwith correlated features\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0924bdb",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}