Data Wrangling - Aggregation on Multiple Columns

Contents

Data Wrangling - Aggregation on Multiple Columns#

Link to Source data #

Task: Sum the columns based on the prefix of the individual columns

from datatable import dt, f

df = dt.Frame(
    {
        "sn": [1, 2, 3],
        "C1-1": [4, 2, 1],
        "C1-2": [3, 2, 2],
        "C1-3": [5, 0, 0],
        "H2-1": [4, 2, 0],
        "H2-2": [1, 0, 2],
        "K3-1": [4, 1, 1],
        "K3-2": [2, 2, 2],
    }
)

df

	sn	C1-1	C1-2	C1-3	H2-1	H2-2	K3-1	K3-2
	▪▪▪▪	▪▪▪▪	▪▪▪▪	▪▪▪▪	▪▪▪▪	▪▪▪▪	▪▪▪▪	▪▪▪▪
0	1	4	3	5	4	1	4	2
1	2	2	2	0	2	0	1	2
2	3	1	2	0	0	2	1	2

Complete Solution#

# import libraries
from collections import defaultdict

# iterate to pair prefix with relevant columns
mapping = defaultdict(list)
for entry in df.names[1:]:
    key = f"total_{entry[:2]}"
    mapping[key].append(entry)

mapping

defaultdict(list,
            {'total_C1': ['C1-1', 'C1-2', 'C1-3'],
             'total_H2': ['H2-1', 'H2-2'],
             'total_K3': ['K3-1', 'K3-2']})

mapping = [f[value].rowsum().alias(key) for key, value in mapping.items()]

mapping

[FExpr<alias(rowsum(f[['C1-1', 'C1-2', 'C1-3']]), [total_C1,])>,
 FExpr<alias(rowsum(f[['H2-1', 'H2-2']]), [total_H2,])>,
 FExpr<alias(rowsum(f[['K3-1', 'K3-2']]), [total_K3,])>]

# actual computation occurs here
df[:, f.sn.extend(mapping)]

	sn	total_C1	total_H2	total_K3
	▪▪▪▪	▪▪▪▪	▪▪▪▪	▪▪▪▪
0	1	12	5	6
1	2	4	2	3
2	3	3	2	3

Comments#