From 65b046b1dd6e7a03b64b83027bcedd046fe3b521 Mon Sep 17 00:00:00 2001
From: "Ekeoma E. AGU" <113532349+Ekediee@users.noreply.github.com>
Date: Sun, 8 Jan 2023 00:31:06 +0000
Subject: [PATCH] generated frequent itemsets and their supports
---
src/02b.ipynb | 790 +++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 747 insertions(+), 43 deletions(-)
diff --git a/src/02b.ipynb b/src/02b.ipynb
index bc26e9b..447e675 100644
--- a/src/02b.ipynb
+++ b/src/02b.ipynb
@@ -44,7 +44,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "b1c664ab",
"metadata": {},
"outputs": [],
@@ -62,7 +62,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"id": "b53e3b3f",
"metadata": {
"scrolled": true
@@ -86,13 +86,64 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"id": "d05752d1",
"metadata": {
"scrolled": true
},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups'],\n",
+ " ['tropical fruit', 'yogurt', 'coffee'],\n",
+ " ['whole milk'],\n",
+ " ['pip fruit', 'yogurt', 'cream cheese', 'meat spreads'],\n",
+ " ['other vegetables',\n",
+ " 'whole milk',\n",
+ " 'condensed milk',\n",
+ " 'long life bakery product']]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "groceries[:5]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "e3a8e11a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['citrus fruit,semi-finished bread,margarine,ready soups',\n",
+ " 'tropical fruit,yogurt,coffee',\n",
+ " 'whole milk',\n",
+ " 'pip fruit,yogurt,cream cheese,meat spreads',\n",
+ " 'other vegetables,whole milk,condensed milk,long life bakery product']"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "groceries2 = []\n",
+ "with open('groceries.csv', 'r') as csvfile:\n",
+ "    # No csv parsing here: each transaction is kept as one comma-joined string.\n",
+ "    for row in csvfile:\n",
+ "        groceries2.append(row.strip())\n",
+ "\n",
+ "groceries2[:5]"
+ ]
},
{
"cell_type": "markdown",
@@ -112,7 +163,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"id": "e53e450e",
"metadata": {},
"outputs": [],
@@ -130,11 +181,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"id": "99669672",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "encoder = TransactionEncoder()"
+ ]
},
{
"cell_type": "markdown",
@@ -146,11 +199,31 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"id": "9019f8e9",
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[False, False, False, ..., False, False, False],\n",
+ " [False, False, False, ..., False, True, False],\n",
+ " [False, False, False, ..., True, False, False],\n",
+ " ...,\n",
+ " [False, False, False, ..., False, True, False],\n",
+ " [False, False, False, ..., False, False, False],\n",
+ " [False, False, False, ..., False, False, False]])"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "transactions = encoder.fit(groceries).transform(groceries)  # learn the item vocabulary, then encode\n",
+ "transactions"
+ ]
},
{
"cell_type": "markdown",
@@ -162,7 +235,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "c5262ae7",
"metadata": {},
"outputs": [],
@@ -180,11 +253,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"id": "42ccf29b",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "itemsets = pd.DataFrame(data=transactions, columns=encoder.columns_)"
+ ]
},
{
"cell_type": "markdown",
@@ -196,13 +271,222 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"id": "e83bab97",
"metadata": {
"scrolled": false
},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " abrasive cleaner | \n",
+ " artif. sweetener | \n",
+ " baby cosmetics | \n",
+ " baby food | \n",
+ " bags | \n",
+ " baking powder | \n",
+ " bathroom cleaner | \n",
+ " beef | \n",
+ " berries | \n",
+ " beverages | \n",
+ " ... | \n",
+ " uht-milk | \n",
+ " vinegar | \n",
+ " waffles | \n",
+ " whipped/sour cream | \n",
+ " whisky | \n",
+ " white bread | \n",
+ " white wine | \n",
+ " whole milk | \n",
+ " yogurt | \n",
+ " zwieback | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 169 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " abrasive cleaner artif. sweetener baby cosmetics baby food bags \\\n",
+ "0 False False False False False \n",
+ "1 False False False False False \n",
+ "2 False False False False False \n",
+ "3 False False False False False \n",
+ "4 False False False False False \n",
+ "\n",
+ " baking powder bathroom cleaner beef berries beverages ... uht-milk \\\n",
+ "0 False False False False False ... False \n",
+ "1 False False False False False ... False \n",
+ "2 False False False False False ... False \n",
+ "3 False False False False False ... False \n",
+ "4 False False False False False ... False \n",
+ "\n",
+ " vinegar waffles whipped/sour cream whisky white bread white wine \\\n",
+ "0 False False False False False False \n",
+ "1 False False False False False False \n",
+ "2 False False False False False False \n",
+ "3 False False False False False False \n",
+ "4 False False False False False False \n",
+ "\n",
+ " whole milk yogurt zwieback \n",
+ "0 False False False \n",
+ "1 False True False \n",
+ "2 True False False \n",
+ "3 False True False \n",
+ "4 True False False \n",
+ "\n",
+ "[5 rows x 169 columns]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "itemsets.head()"
+ ]
},
{
"cell_type": "markdown",
@@ -214,13 +498,27 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"id": "92c5e631",
"metadata": {
"scrolled": true
},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 9835 entries, 0 to 9834\n",
+ "Columns: 169 entries, abrasive cleaner to zwieback\n",
+ "dtypes: bool(169)\n",
+ "memory usage: 1.6 MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "itemsets.info()"
+ ]
},
{
"cell_type": "markdown",
@@ -248,11 +546,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"id": "947456f2",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "from mlxtend.frequent_patterns import apriori"
+ ]
},
{
"cell_type": "markdown",
@@ -266,11 +566,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"id": "3db7d690",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "frequent_itemsets = apriori(df=itemsets, min_support=0.015, use_colnames=True)"
+ ]
},
{
"cell_type": "markdown",
@@ -282,13 +584,123 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"id": "046f5901",
"metadata": {
"scrolled": false
},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " support | \n",
+ " itemsets | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.017692 | \n",
+ " (baking powder) | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.052466 | \n",
+ " (beef) | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.033249 | \n",
+ " (berries) | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.026029 | \n",
+ " (beverages) | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.080529 | \n",
+ " (bottled beer) | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 175 | \n",
+ " 0.023183 | \n",
+ " (root vegetables, other vegetables, whole milk) | \n",
+ "
\n",
+ " \n",
+ " | 176 | \n",
+ " 0.017082 | \n",
+ " (other vegetables, tropical fruit, whole milk) | \n",
+ "
\n",
+ " \n",
+ " | 177 | \n",
+ " 0.022267 | \n",
+ " (other vegetables, yogurt, whole milk) | \n",
+ "
\n",
+ " \n",
+ " | 178 | \n",
+ " 0.015557 | \n",
+ " (rolls/buns, whole milk, yogurt) | \n",
+ "
\n",
+ " \n",
+ " | 179 | \n",
+ " 0.015150 | \n",
+ " (yogurt, tropical fruit, whole milk) | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
180 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " support itemsets\n",
+ "0 0.017692 (baking powder)\n",
+ "1 0.052466 (beef)\n",
+ "2 0.033249 (berries)\n",
+ "3 0.026029 (beverages)\n",
+ "4 0.080529 (bottled beer)\n",
+ ".. ... ...\n",
+ "175 0.023183 (root vegetables, other vegetables, whole milk)\n",
+ "176 0.017082 (other vegetables, tropical fruit, whole milk)\n",
+ "177 0.022267 (other vegetables, yogurt, whole milk)\n",
+ "178 0.015557 (rolls/buns, whole milk, yogurt)\n",
+ "179 0.015150 (yogurt, tropical fruit, whole milk)\n",
+ "\n",
+ "[180 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frequent_itemsets"
+ ]
},
{
"cell_type": "markdown",
@@ -300,11 +712,121 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"id": "bfba8e98",
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " support | \n",
+ " itemsets | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 71 | \n",
+ " 0.255516 | \n",
+ " (whole milk) | \n",
+ "
\n",
+ " \n",
+ " | 44 | \n",
+ " 0.193493 | \n",
+ " (other vegetables) | \n",
+ "
\n",
+ " \n",
+ " | 53 | \n",
+ " 0.183935 | \n",
+ " (rolls/buns) | \n",
+ "
\n",
+ " \n",
+ " | 60 | \n",
+ " 0.174377 | \n",
+ " (soda) | \n",
+ "
\n",
+ " \n",
+ " | 72 | \n",
+ " 0.139502 | \n",
+ " (yogurt) | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 163 | \n",
+ " 0.015252 | \n",
+ " (shopping bags, yogurt) | \n",
+ "
\n",
+ " \n",
+ " | 179 | \n",
+ " 0.015150 | \n",
+ " (yogurt, tropical fruit, whole milk) | \n",
+ "
\n",
+ " \n",
+ " | 45 | \n",
+ " 0.015048 | \n",
+ " (pasta) | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 0.015048 | \n",
+ " (canned fish) | \n",
+ "
\n",
+ " \n",
+ " | 167 | \n",
+ " 0.015048 | \n",
+ " (whole milk, sugar) | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
180 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " support itemsets\n",
+ "71 0.255516 (whole milk)\n",
+ "44 0.193493 (other vegetables)\n",
+ "53 0.183935 (rolls/buns)\n",
+ "60 0.174377 (soda)\n",
+ "72 0.139502 (yogurt)\n",
+ ".. ... ...\n",
+ "163 0.015252 (shopping bags, yogurt)\n",
+ "179 0.015150 (yogurt, tropical fruit, whole milk)\n",
+ "45 0.015048 (pasta)\n",
+ "11 0.015048 (canned fish)\n",
+ "167 0.015048 (whole milk, sugar)\n",
+ "\n",
+ "[180 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frequent_itemsets.sort_values(by='support', ascending=False)"
+ ]
},
{
"cell_type": "markdown",
@@ -318,11 +840,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"id": "48ad4460",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "length = frequent_itemsets['itemsets'].map(len)  # number of items in each itemset"
+ ]
},
{
"cell_type": "markdown",
@@ -334,13 +858,15 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"id": "8302997b",
"metadata": {
"scrolled": true
},
"outputs": [],
- "source": []
+ "source": [
+ "rows = length.gt(2)  # boolean mask: itemsets holding more than two items"
+ ]
},
{
"cell_type": "markdown",
@@ -352,11 +878,88 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 22,
"id": "be48d27a",
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " support | \n",
+ " itemsets | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 174 | \n",
+ " 0.017895 | \n",
+ " (other vegetables, rolls/buns, whole milk) | \n",
+ "
\n",
+ " \n",
+ " | 175 | \n",
+ " 0.023183 | \n",
+ " (root vegetables, other vegetables, whole milk) | \n",
+ "
\n",
+ " \n",
+ " | 176 | \n",
+ " 0.017082 | \n",
+ " (other vegetables, tropical fruit, whole milk) | \n",
+ "
\n",
+ " \n",
+ " | 177 | \n",
+ " 0.022267 | \n",
+ " (other vegetables, yogurt, whole milk) | \n",
+ "
\n",
+ " \n",
+ " | 178 | \n",
+ " 0.015557 | \n",
+ " (rolls/buns, whole milk, yogurt) | \n",
+ "
\n",
+ " \n",
+ " | 179 | \n",
+ " 0.015150 | \n",
+ " (yogurt, tropical fruit, whole milk) | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " support itemsets\n",
+ "174 0.017895 (other vegetables, rolls/buns, whole milk)\n",
+ "175 0.023183 (root vegetables, other vegetables, whole milk)\n",
+ "176 0.017082 (other vegetables, tropical fruit, whole milk)\n",
+ "177 0.022267 (other vegetables, yogurt, whole milk)\n",
+ "178 0.015557 (rolls/buns, whole milk, yogurt)\n",
+ "179 0.015150 (yogurt, tropical fruit, whole milk)"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frequent_itemsets.loc[rows]"
+ ]
},
{
"cell_type": "markdown",
@@ -370,11 +973,112 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 23,
"id": "93699e5d",
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " count | \n",
+ " mean | \n",
+ " std | \n",
+ " min | \n",
+ " 25% | \n",
+ " 50% | \n",
+ " 75% | \n",
+ " max | \n",
+ "
\n",
+ " \n",
+ " | itemsets | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " 73.0 | \n",
+ " 0.053441 | \n",
+ " 0.045956 | \n",
+ " 0.015048 | \n",
+ " 0.024504 | \n",
+ " 0.037112 | \n",
+ " 0.064870 | \n",
+ " 0.255516 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 101.0 | \n",
+ " 0.024799 | \n",
+ " 0.010058 | \n",
+ " 0.015048 | \n",
+ " 0.018404 | \n",
+ " 0.021047 | \n",
+ " 0.027555 | \n",
+ " 0.074835 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 6.0 | \n",
+ " 0.018522 | \n",
+ " 0.003417 | \n",
+ " 0.015150 | \n",
+ " 0.015938 | \n",
+ " 0.017489 | \n",
+ " 0.021174 | \n",
+ " 0.023183 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " count mean std min 25% 50% 75% \\\n",
+ "itemsets \n",
+ "1 73.0 0.053441 0.045956 0.015048 0.024504 0.037112 0.064870 \n",
+ "2 101.0 0.024799 0.010058 0.015048 0.018404 0.021047 0.027555 \n",
+ "3 6.0 0.018522 0.003417 0.015150 0.015938 0.017489 0.021174 \n",
+ "\n",
+ " max \n",
+ "itemsets \n",
+ "1 0.255516 \n",
+ "2 0.074835 \n",
+ "3 0.023183 "
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frequent_itemsets['support'].groupby(length).describe()"
+ ]
},
{
"cell_type": "markdown",
@@ -755,7 +1459,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -769,11 +1473,11 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.7"
+ "version": "3.10.4"
},
"vscode": {
"interpreter": {
- "hash": "4f946df053fbf2b937619d3c5458e7af74262f9a954d8797ba0b27400bcafe06"
+ "hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858"
}
}
},