1033 lines
104 KiB
Plaintext
1033 lines
104 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# KMeans issues\n",
|
|
"> - Choose number of clusters *k* \n",
|
|
"> - Choose an **objective function** for selecting *k* \n",
|
|
"> - Try various values of *k* such that the **value of objective function** is maximized or minimized \n",
|
|
"> - For KMeans, the **objective function** is the **Sum of Squared Distances** of samples to their closest cluster center \n",
|
|
"> - **Elbow method** is a popular way of choosing *k* \n",
|
|
"#### Reference: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Wholesale Data\n",
|
|
"### Feature 1: Types of products (Fresh, Milk, Grocery, Frozen, Detergents_Paper, Delicassen)\n",
|
|
"### Feature 2: Purchasing behavior (Channel and Region) "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import matplotlib.pyplot as plt\n",
|
|
"%matplotlib inline\n",
|
|
"import pandas as pd\n",
|
|
"wholesale = pd.read_csv(\"Wholesale_customers_data.csv\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Channel</th>\n",
|
|
" <th>Region</th>\n",
|
|
" <th>Fresh</th>\n",
|
|
" <th>Milk</th>\n",
|
|
" <th>Grocery</th>\n",
|
|
" <th>Frozen</th>\n",
|
|
" <th>Detergents_Paper</th>\n",
|
|
" <th>Delicassen</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>2</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>12669</td>\n",
|
|
" <td>9656</td>\n",
|
|
" <td>7561</td>\n",
|
|
" <td>214</td>\n",
|
|
" <td>2674</td>\n",
|
|
" <td>1338</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>2</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>7057</td>\n",
|
|
" <td>9810</td>\n",
|
|
" <td>9568</td>\n",
|
|
" <td>1762</td>\n",
|
|
" <td>3293</td>\n",
|
|
" <td>1776</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>2</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>6353</td>\n",
|
|
" <td>8808</td>\n",
|
|
" <td>7684</td>\n",
|
|
" <td>2405</td>\n",
|
|
" <td>3516</td>\n",
|
|
" <td>7844</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>13265</td>\n",
|
|
" <td>1196</td>\n",
|
|
" <td>4221</td>\n",
|
|
" <td>6404</td>\n",
|
|
" <td>507</td>\n",
|
|
" <td>1788</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>2</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>22615</td>\n",
|
|
" <td>5410</td>\n",
|
|
" <td>7198</td>\n",
|
|
" <td>3915</td>\n",
|
|
" <td>1777</td>\n",
|
|
" <td>5185</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen\n",
|
|
"0 2 3 12669 9656 7561 214 2674 1338\n",
|
|
"1 2 3 7057 9810 9568 1762 3293 1776\n",
|
|
"2 2 3 6353 8808 7684 2405 3516 7844\n",
|
|
"3 1 3 13265 1196 4221 6404 507 1788\n",
|
|
"4 2 3 22615 5410 7198 3915 1777 5185"
|
|
]
|
|
},
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"wholesale.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(440, 8)"
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"wholesale.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"categorical_features = ['Channel', 'Region']\n",
|
|
"continuous_features = ['Fresh', 'Milk', 'Grocery', 'Frozen', \n",
|
|
" 'Detergents_Paper', 'Delicassen']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Fresh</th>\n",
|
|
" <th>Milk</th>\n",
|
|
" <th>Grocery</th>\n",
|
|
" <th>Frozen</th>\n",
|
|
" <th>Detergents_Paper</th>\n",
|
|
" <th>Delicassen</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>440.000000</td>\n",
|
|
" <td>440.000000</td>\n",
|
|
" <td>440.000000</td>\n",
|
|
" <td>440.000000</td>\n",
|
|
" <td>440.000000</td>\n",
|
|
" <td>440.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>12000.297727</td>\n",
|
|
" <td>5796.265909</td>\n",
|
|
" <td>7951.277273</td>\n",
|
|
" <td>3071.931818</td>\n",
|
|
" <td>2881.493182</td>\n",
|
|
" <td>1524.870455</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>12647.328865</td>\n",
|
|
" <td>7380.377175</td>\n",
|
|
" <td>9503.162829</td>\n",
|
|
" <td>4854.673333</td>\n",
|
|
" <td>4767.854448</td>\n",
|
|
" <td>2820.105937</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>3.000000</td>\n",
|
|
" <td>55.000000</td>\n",
|
|
" <td>3.000000</td>\n",
|
|
" <td>25.000000</td>\n",
|
|
" <td>3.000000</td>\n",
|
|
" <td>3.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>3127.750000</td>\n",
|
|
" <td>1533.000000</td>\n",
|
|
" <td>2153.000000</td>\n",
|
|
" <td>742.250000</td>\n",
|
|
" <td>256.750000</td>\n",
|
|
" <td>408.250000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>8504.000000</td>\n",
|
|
" <td>3627.000000</td>\n",
|
|
" <td>4755.500000</td>\n",
|
|
" <td>1526.000000</td>\n",
|
|
" <td>816.500000</td>\n",
|
|
" <td>965.500000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>16933.750000</td>\n",
|
|
" <td>7190.250000</td>\n",
|
|
" <td>10655.750000</td>\n",
|
|
" <td>3554.250000</td>\n",
|
|
" <td>3922.000000</td>\n",
|
|
" <td>1820.250000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>112151.000000</td>\n",
|
|
" <td>73498.000000</td>\n",
|
|
" <td>92780.000000</td>\n",
|
|
" <td>60869.000000</td>\n",
|
|
" <td>40827.000000</td>\n",
|
|
" <td>47943.000000</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Fresh Milk Grocery Frozen \\\n",
|
|
"count 440.000000 440.000000 440.000000 440.000000 \n",
|
|
"mean 12000.297727 5796.265909 7951.277273 3071.931818 \n",
|
|
"std 12647.328865 7380.377175 9503.162829 4854.673333 \n",
|
|
"min 3.000000 55.000000 3.000000 25.000000 \n",
|
|
"25% 3127.750000 1533.000000 2153.000000 742.250000 \n",
|
|
"50% 8504.000000 3627.000000 4755.500000 1526.000000 \n",
|
|
"75% 16933.750000 7190.250000 10655.750000 3554.250000 \n",
|
|
"max 112151.000000 73498.000000 92780.000000 60869.000000 \n",
|
|
"\n",
|
|
" Detergents_Paper Delicassen \n",
|
|
"count 440.000000 440.000000 \n",
|
|
"mean 2881.493182 1524.870455 \n",
|
|
"std 4767.854448 2820.105937 \n",
|
|
"min 3.000000 3.000000 \n",
|
|
"25% 256.750000 408.250000 \n",
|
|
"50% 816.500000 965.500000 \n",
|
|
"75% 3922.000000 1820.250000 \n",
|
|
"max 40827.000000 47943.000000 "
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"wholesale[continuous_features].describe()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df1 = wholesale[continuous_features]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"> **fit** and **transform** work as a pair: **fit** learns the parameters from the data, and **transform** applies them to the data. \n",
|
|
"> **KMeans(n_clusters=k)** sets up the model framework with *k* clusters. \n",
|
|
"> **KMeans(n_clusters=k).fit(data)** fits (estimates) the model on the data. \n",
|
|
"> **KMeans(n_clusters=k).fit(data).inertia_** provides the sum of squared errors (squared distances of samples to their closest cluster center). "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"ename": "NameError",
|
|
"evalue": "name 'df1' is not defined",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn [1], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocessing\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MinMaxScaler \u001b[38;5;66;03m# (Value-Min)/(Max-Min)\u001b[39;00m\n\u001b[1;32m 2\u001b[0m mms \u001b[38;5;241m=\u001b[39m MinMaxScaler()\n\u001b[0;32m----> 3\u001b[0m mms\u001b[38;5;241m.\u001b[39mfit(\u001b[43mdf1\u001b[49m)\n\u001b[1;32m 4\u001b[0m df1 \u001b[38;5;241m=\u001b[39m mms\u001b[38;5;241m.\u001b[39mtransform(df1)\n",
|
|
"\u001b[0;31mNameError\u001b[0m: name 'df1' is not defined"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.preprocessing import MinMaxScaler # (Value-Min)/(Max-Min)\n",
|
|
"mms = MinMaxScaler()\n",
|
|
"mms.fit(df1)\n",
|
|
"df1 = mms.transform(df1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.cluster import KMeans\n",
|
|
"kmeans = KMeans(n_clusters=5)\n",
|
|
"kmeans.fit(df1)\n",
|
|
"y_kmeans = kmeans.predict(df1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Fresh</th>\n",
|
|
" <th>Milk</th>\n",
|
|
" <th>Grocery</th>\n",
|
|
" <th>Frozen</th>\n",
|
|
" <th>Detergents_Paper</th>\n",
|
|
" <th>Delicassen</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>0.079315</td>\n",
|
|
" <td>0.041126</td>\n",
|
|
" <td>0.041647</td>\n",
|
|
" <td>0.043862</td>\n",
|
|
" <td>0.024727</td>\n",
|
|
" <td>0.021675</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>0.142329</td>\n",
|
|
" <td>0.471842</td>\n",
|
|
" <td>0.523124</td>\n",
|
|
" <td>0.049793</td>\n",
|
|
" <td>0.609254</td>\n",
|
|
" <td>0.061322</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>0.322837</td>\n",
|
|
" <td>0.068403</td>\n",
|
|
" <td>0.061179</td>\n",
|
|
" <td>0.098276</td>\n",
|
|
" <td>0.022222</td>\n",
|
|
" <td>0.045945</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>0.051815</td>\n",
|
|
" <td>0.145674</td>\n",
|
|
" <td>0.183377</td>\n",
|
|
" <td>0.023605</td>\n",
|
|
" <td>0.180126</td>\n",
|
|
" <td>0.038898</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>5</th>\n",
|
|
" <td>0.310117</td>\n",
|
|
" <td>0.412728</td>\n",
|
|
" <td>0.182103</td>\n",
|
|
" <td>0.800021</td>\n",
|
|
" <td>0.018433</td>\n",
|
|
" <td>0.558469</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Fresh Milk Grocery Frozen Detergents_Paper Delicassen\n",
|
|
"1 0.079315 0.041126 0.041647 0.043862 0.024727 0.021675\n",
|
|
"2 0.142329 0.471842 0.523124 0.049793 0.609254 0.061322\n",
|
|
"3 0.322837 0.068403 0.061179 0.098276 0.022222 0.045945\n",
|
|
"4 0.051815 0.145674 0.183377 0.023605 0.180126 0.038898\n",
|
|
"5 0.310117 0.412728 0.182103 0.800021 0.018433 0.558469"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"centers = kmeans.cluster_centers_\n",
|
|
"pd.DataFrame(centers,columns=continuous_features,index=range(1,6))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Create Dummy for Categorical Attributes\n",
|
|
"> Both **set(data)** and **data.unique()** give unique elements in data. \n",
|
|
"> Suppose $x$ has 3 categories {1,2,3}: \n",
|
|
"> **pd.get_dummies(x, prefix='VAR')** gives the dummy columns `VAR_1`, `VAR_2`, and `VAR_3`. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{1, 2}\n",
|
|
"{1, 2, 3}\n",
|
|
"<class 'set'>\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# There are: 2 values of 'Channel'={1,2} & 3 values of 'Region'={1,2,3}\n",
|
|
"print(set(wholesale['Channel']))\n",
|
|
"print(set(wholesale['Region']))\n",
|
|
"print(type(set(wholesale['Channel'])))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[2 1]\n",
|
|
"[3 1 2]\n",
|
|
"<class 'numpy.ndarray'>\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(wholesale['Channel'].unique())\n",
|
|
"print(wholesale['Region'].unique())\n",
|
|
"print(type(wholesale['Channel'].unique()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Channel_1</th>\n",
|
|
" <th>Channel_2</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Channel_1 Channel_2\n",
|
|
"0 0 1\n",
|
|
"1 0 1\n",
|
|
"2 0 1\n",
|
|
"3 1 0\n",
|
|
"4 0 1"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"pd.get_dummies(wholesale['Channel'], prefix='Channel' ).head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"['Channel', 'Region']\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Fresh</th>\n",
|
|
" <th>Milk</th>\n",
|
|
" <th>Grocery</th>\n",
|
|
" <th>Frozen</th>\n",
|
|
" <th>Detergents_Paper</th>\n",
|
|
" <th>Delicassen</th>\n",
|
|
" <th>Channel_1</th>\n",
|
|
" <th>Channel_2</th>\n",
|
|
" <th>Region_1</th>\n",
|
|
" <th>Region_2</th>\n",
|
|
" <th>Region_3</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>12669</td>\n",
|
|
" <td>9656</td>\n",
|
|
" <td>7561</td>\n",
|
|
" <td>214</td>\n",
|
|
" <td>2674</td>\n",
|
|
" <td>1338</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>7057</td>\n",
|
|
" <td>9810</td>\n",
|
|
" <td>9568</td>\n",
|
|
" <td>1762</td>\n",
|
|
" <td>3293</td>\n",
|
|
" <td>1776</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>6353</td>\n",
|
|
" <td>8808</td>\n",
|
|
" <td>7684</td>\n",
|
|
" <td>2405</td>\n",
|
|
" <td>3516</td>\n",
|
|
" <td>7844</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>13265</td>\n",
|
|
" <td>1196</td>\n",
|
|
" <td>4221</td>\n",
|
|
" <td>6404</td>\n",
|
|
" <td>507</td>\n",
|
|
" <td>1788</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>22615</td>\n",
|
|
" <td>5410</td>\n",
|
|
" <td>7198</td>\n",
|
|
" <td>3915</td>\n",
|
|
" <td>1777</td>\n",
|
|
" <td>5185</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Fresh Milk Grocery Frozen Detergents_Paper Delicassen Channel_1 \\\n",
|
|
"0 12669 9656 7561 214 2674 1338 0 \n",
|
|
"1 7057 9810 9568 1762 3293 1776 0 \n",
|
|
"2 6353 8808 7684 2405 3516 7844 0 \n",
|
|
"3 13265 1196 4221 6404 507 1788 1 \n",
|
|
"4 22615 5410 7198 3915 1777 5185 0 \n",
|
|
"\n",
|
|
" Channel_2 Region_1 Region_2 Region_3 \n",
|
|
"0 1 0 0 1 \n",
|
|
"1 1 0 0 1 \n",
|
|
"2 1 0 0 1 \n",
|
|
"3 0 0 0 1 \n",
|
|
"4 1 0 0 1 "
|
|
]
|
|
},
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# i iterates over the categorical features 'Channel' and 'Region'\n",
|
|
"# Each feature name is used as the prefix of its dummy columns\n",
|
|
"# Combine original data with the 5 new dummies\n",
|
|
"# After creating the new dummies, drop the original categorical column from the data\n",
|
|
"print(categorical_features)\n",
|
|
"\n",
|
|
"for i in categorical_features:\n",
|
|
" dummies = pd.get_dummies(wholesale[i], prefix=i)\n",
|
|
" wholesale = pd.concat([wholesale, dummies], axis=1)\n",
|
|
" wholesale.drop(i, axis=1, inplace=True)\n",
|
|
" \n",
|
|
"wholesale.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/data.py:334: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by MinMaxScaler.\n",
|
|
" return self.partial_fit(X, y)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"mms = MinMaxScaler()\n",
|
|
"mms.fit(wholesale)\n",
|
|
"data = mms.transform(wholesale)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Initialize a container for Sum Squared Error\n",
|
|
"\n",
|
|
"sqerror = []\n",
|
|
"K = range(1,15)\n",
|
|
"for k in K:\n",
|
|
" km = KMeans(n_clusters=k) # Specify model options\n",
|
|
" km = km.fit(data) # Fit the data\n",
|
|
" sqerror.append(km.inertia_)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 432x288 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"needs_background": "light"
|
|
},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"plt.plot(K, sqerror, 'bx-')\n",
|
|
"plt.xlabel('Number of Clusters (k)')\n",
|
|
"plt.ylabel('Sum Squared Distance')\n",
|
|
"plt.title('Elbow Method')\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Activity 1\n",
|
|
"> Use the following data to fit a KMeans model. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(300, 2)\n",
|
|
"(300,)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.datasets import make_blobs\n",
|
|
"X, y_true = make_blobs(n_samples=300, centers=4,\n",
|
|
" cluster_std=0.60, random_state=0)\n",
|
|
"print(X.shape)\n",
|
|
"print(y_true.shape)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 432x288 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"needs_background": "light"
|
|
},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"plt.scatter(X[:, 0], X[:, 1], s=50)\n",
|
|
"plt.xlabel('X0')\n",
|
|
"plt.ylabel('X1');"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.cluster import KMeans\n",
|
|
"kmeans = KMeans(n_clusters=4)\n",
|
|
"kmeans.fit(X)\n",
|
|
"y_kmeans = kmeans.predict(X)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>X0</th>\n",
|
|
" <th>X1</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>0.949735</td>\n",
|
|
" <td>4.419069</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>1.982583</td>\n",
|
|
" <td>0.867713</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>-1.584385</td>\n",
|
|
" <td>2.830813</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>-1.373244</td>\n",
|
|
" <td>7.753689</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" X0 X1\n",
|
|
"1 0.949735 4.419069\n",
|
|
"2 1.982583 0.867713\n",
|
|
"3 -1.584385 2.830813\n",
|
|
"4 -1.373244 7.753689"
|
|
]
|
|
},
|
|
"execution_count": 20,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"centers = kmeans.cluster_centers_\n",
|
|
"pd.DataFrame(centers,columns=['X0','X1'], index=range(1,5))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 432x288 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"needs_background": "light"
|
|
},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis')\n",
|
|
"\n",
|
|
"plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5);"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|