{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Wrangling - Aggregation on Multiple Columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### [Link to Source data](https://stackoverflow.com/questions/64903216/append-new-column-to-df-after-sum)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Task: Sum the columns based on the prefix of the individual columns"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" | sn | C1-1 | C1-2 | C1-3 | H2-1 | H2-2 | K3-1 | K3-2 |
\n",
" | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ |
\n",
" \n",
" \n",
" 0 | 1 | 4 | 3 | 5 | 4 | 1 | 4 | 2 |
\n",
" 1 | 2 | 2 | 2 | 0 | 2 | 0 | 1 | 2 |
\n",
" 2 | 3 | 1 | 2 | 0 | 0 | 2 | 1 | 2 |
\n",
" \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from datatable import dt, f\n",
"\n",
"df = dt.Frame(\n",
" {\n",
" \"sn\": [1, 2, 3],\n",
" \"C1-1\": [4, 2, 1],\n",
" \"C1-2\": [3, 2, 2],\n",
" \"C1-3\": [5, 0, 0],\n",
" \"H2-1\": [4, 2, 0],\n",
" \"H2-2\": [1, 0, 2],\n",
" \"K3-1\": [4, 1, 1],\n",
" \"K3-2\": [2, 2, 2],\n",
" }\n",
")\n",
"\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **Complete Solution**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(list,\n",
" {'total_C1': ['C1-1', 'C1-2', 'C1-3'],\n",
" 'total_H2': ['H2-1', 'H2-2'],\n",
" 'total_K3': ['K3-1', 'K3-2']})"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import libraries\n",
"from collections import defaultdict\n",
"\n",
"# iterate to pair prefix with relevant columns\n",
"mapping = defaultdict(list)\n",
"for entry in df.names[1:]:\n",
" key = f\"total_{entry[:2]}\"\n",
" mapping[key].append(entry)\n",
"\n",
"mapping"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[FExpr,\n",
" FExpr,\n",
" FExpr]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mapping = [f[value].rowsum().alias(key) for key, value in mapping.items()]\n",
"\n",
"mapping"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" | sn | total_C1 | total_H2 | total_K3 |
\n",
" | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪ |
\n",
" \n",
" \n",
" 0 | 1 | 12 | 5 | 6 |
\n",
" 1 | 2 | 4 | 2 | 3 |
\n",
" 2 | 3 | 3 | 2 | 3 |
\n",
" \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# actual computation occurs here\n",
"df[:, f.sn.extend(mapping)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Comments\n",
""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 4
}