Dplyr’s across: Replicating within Pandas

Dplyr’s across: Replicating within Pandas#

dplyr has the across function, which is meant to make column wise processing easy. Pandas has always supported column-wise operations, and in a relatively simple manner.

This article aims to replicate solutions in the dplyr column-wise operations vignette with Pandas.

Let’s load in the relevant libraries

import pandas as pd
import numpy as np
import janitor as jn
import sys

print(" pandas version :", pd.__version__,"\n", 
      "numpy version :", np.__version__, "\n", 
      "janitor version :", jn.__version__,"\n",
      "python version :", sys.version)

 pandas version : 2.2.2 
 numpy version : 2.0.2 
 janitor version : 0.31.0 
 python version : 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]

url = "https://raw.githubusercontent.com/tidyverse/dplyr/main/data-raw/starwars.csv"
starwars = pd.read_csv(url)
starwars.head()

	name	height	mass	hair_color	skin_color	eye_color	birth_year	sex	gender	homeworld	species	films	vehicles	starships
0	Luke Skywalker	172.0	77.0	blond	fair	blue	19.0	male	masculine	Tatooine	Human	A New Hope, The Empire Strikes Back, Return of...	Snowspeeder, Imperial Speeder Bike	X-wing, Imperial shuttle
1	C-3PO	167.0	75.0	NaN	gold	yellow	112.0	none	masculine	Tatooine	Droid	A New Hope, The Empire Strikes Back, Return of...	NaN	NaN
2	R2-D2	96.0	32.0	NaN	white, blue	red	33.0	none	masculine	Naboo	Droid	A New Hope, The Empire Strikes Back, Return of...	NaN	NaN
3	Darth Vader	202.0	136.0	none	white	yellow	41.9	male	masculine	Tatooine	Human	A New Hope, The Empire Strikes Back, Return of...	NaN	TIE Advanced x1
4	Leia Organa	150.0	49.0	brown	light	brown	19.0	female	feminine	Alderaan	Human	A New Hope, The Empire Strikes Back, Return of...	Imperial Speeder Bike	NaN

# dplyr : 
# starwars %>% 
#   summarise(across(where(is.character), n_distinct))
(starwars
.select_dtypes('object')
.nunique(dropna=False))

name          87
hair_color    12
skin_color    31
eye_color     15
sex            5
gender         3
homeworld     49
species       38
films         24
vehicles      11
starships     16
dtype: int64

# dplyr :
# starwars %>% 
#   group_by(species) %>% 
#   filter(n() > 1) %>% 
#   summarise(across(c(sex, gender, homeworld), n_distinct))

grp = starwars.groupby('species')

(grp[['sex', 'gender', 'homeworld']]
 .nunique(dropna=False)
 .loc[grp.size().gt(1)]
 )

	sex	gender	homeworld
species
Droid	1	2	3
Gungan	1	1	1
Human	2	2	15
Kaminoan	2	2	1
Mirialan	1	1	1
Twi'lek	2	2	1
Wookiee	1	1	1
Zabrak	1	1	2

Another approach is with the pipe function:

(starwars
.filter(['species', 'sex', 'gender', 'homeworld'])
.groupby('species')
.pipe(lambda df: df.nunique(dropna = False).loc[df.size().gt(1)])
)

	sex	gender	homeworld
species
Droid	1	2	3
Gungan	1	1	1
Human	2	2	15
Kaminoan	2	2	1
Mirialan	1	1	1
Twi'lek	2	2	1
Wookiee	1	1	1
Zabrak	1	1	2

with janitor.summarise:

func = lambda df: df.nunique(dropna=False)
(starwars
 .summarise((['sex','gender','homeworld'], func), 
            {'n':('species','size')},
            by='species'
        )
 .loc[lambda df: df.n.gt(1), :'homeworld']
)

	sex	gender	homeworld
species
Human	2	2	15
Droid	1	2	3
Wookiee	1	1	1
Gungan	1	1	1
Zabrak	1	1	2
Twi'lek	2	2	1
Mirialan	1	1	1
Kaminoan	2	2	1

# dplyr : 
# starwars %>% 
#   group_by(homeworld) %>% 
#   filter(n() > 1) %>% 
#   summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))

# get the grouper
grp = (starwars
       .set_index('homeworld')
       .select_dtypes('number')
       .groupby('homeworld')
       )
# get the size:
n = grp.size()
# filter averages with the size:
grp.mean().loc[n.gt(1)]

	height	mass	birth_year
homeworld
Alderaan	176.333333	64.000000	43.000000
Corellia	175.000000	78.500000	25.000000
Coruscant	173.666667	50.000000	91.000000
Kamino	208.333333	83.100000	31.500000
Kashyyyk	231.000000	124.000000	200.000000
Mirial	168.000000	53.100000	49.000000
Naboo	177.272727	64.166667	55.000000
Ryloth	179.000000	55.000000	48.000000
Tatooine	169.800000	85.375000	54.644444

With the pipe function:

(starwars
.set_index('homeworld')
.select_dtypes('number')
.groupby('homeworld')
.pipe(lambda df: df.mean().loc[df.size().gt(1)])
)

	height	mass	birth_year
homeworld
Alderaan	176.333333	64.000000	43.000000
Corellia	175.000000	78.500000	25.000000
Coruscant	173.666667	50.000000	91.000000
Kamino	208.333333	83.100000	31.500000
Kashyyyk	231.000000	124.000000	200.000000
Mirial	168.000000	53.100000	49.000000
Naboo	177.272727	64.166667	55.000000
Ryloth	179.000000	55.000000	48.000000
Tatooine	169.800000	85.375000	54.644444

with janitor.summarise:

from pandas.api.types import is_numeric_dtype

(starwars
 .summarise((is_numeric_dtype, 'mean'), 
            {'n':('homeworld','size')},
            by='homeworld'
        )
 .loc[lambda df: df.n.gt(1), :'birth_year']
)

	height	mass	birth_year
homeworld
Tatooine	169.800000	85.375000	54.644444
Naboo	177.272727	64.166667	55.000000
Alderaan	176.333333	64.000000	43.000000
Kashyyyk	231.000000	124.000000	200.000000
Corellia	175.000000	78.500000	25.000000
Kamino	208.333333	83.100000	31.500000
Coruscant	173.666667	50.000000	91.000000
Ryloth	179.000000	55.000000	48.000000
Mirial	168.000000	53.100000	49.000000

An alternate approach, where the groupby is reused, is with pyjanitor’s get_columns function:

grp = starwars.groupby('homeworld')
(grp
 .pipe(jn.get_columns, is_numeric_dtype)
 .mean()
 .loc[grp.size().gt(1)]
 )

	height	mass	birth_year
homeworld
Alderaan	176.333333	64.000000	43.000000
Corellia	175.000000	78.500000	25.000000
Coruscant	173.666667	50.000000	91.000000
Kamino	208.333333	83.100000	31.500000
Kashyyyk	231.000000	124.000000	200.000000
Mirial	168.000000	53.100000	49.000000
Naboo	177.272727	64.166667	55.000000
Ryloth	179.000000	55.000000	48.000000
Tatooine	169.800000	85.375000	54.644444

# dplyr:
# starwars %>% distinct(across(contains("color")))
starwars.filter(like='color').drop_duplicates()

	hair_color	skin_color	eye_color
0	blond	fair	blue
1	NaN	gold	yellow
2	NaN	white, blue	red
3	none	white	yellow
4	brown	light	brown
...	...	...	...
80	none	pale	white
82	black	dark	dark
83	brown	light	hazel
85	none	none	black
86	none	none	unknown

67 rows × 3 columns

# dplyr
# starwars %>% count(across(contains("color")), sort = TRUE)
starwars.filter(like='color').value_counts(dropna = False)

hair_color    skin_color  eye_color
brown         light       brown        6
              fair        blue         4
none          grey        black        4
black         dark        brown        3
blond         fair        blue         3
                                      ..
none          dark        brown        1
              green       black        1
auburn, grey  fair        blue         1
none          green       red          1
NaN           white, red  red          1
Name: count, Length: 67, dtype: int64

# starwars %>% 
#   filter(if_any(everything(), ~ !is.na(.x)))
starwars.loc[starwars.notna().any(axis=1)]

	name	height	mass	hair_color	skin_color	eye_color	birth_year	sex	gender	homeworld	species	films	vehicles	starships
0	Luke Skywalker	172.0	77.0	blond	fair	blue	19.0	male	masculine	Tatooine	Human	A New Hope, The Empire Strikes Back, Return of...	Snowspeeder, Imperial Speeder Bike	X-wing, Imperial shuttle
1	C-3PO	167.0	75.0	NaN	gold	yellow	112.0	none	masculine	Tatooine	Droid	A New Hope, The Empire Strikes Back, Return of...	NaN	NaN
2	R2-D2	96.0	32.0	NaN	white, blue	red	33.0	none	masculine	Naboo	Droid	A New Hope, The Empire Strikes Back, Return of...	NaN	NaN
3	Darth Vader	202.0	136.0	none	white	yellow	41.9	male	masculine	Tatooine	Human	A New Hope, The Empire Strikes Back, Return of...	NaN	TIE Advanced x1
4	Leia Organa	150.0	49.0	brown	light	brown	19.0	female	feminine	Alderaan	Human	A New Hope, The Empire Strikes Back, Return of...	Imperial Speeder Bike	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
82	Finn	NaN	NaN	black	dark	dark	NaN	male	masculine	NaN	Human	The Force Awakens	NaN	NaN
83	Rey	NaN	NaN	brown	light	hazel	NaN	female	feminine	NaN	Human	The Force Awakens	NaN	NaN
84	Poe Dameron	NaN	NaN	brown	light	brown	NaN	male	masculine	NaN	Human	The Force Awakens	NaN	X-wing
85	BB8	NaN	NaN	none	none	black	NaN	none	masculine	NaN	Droid	The Force Awakens	NaN	NaN
86	Captain Phasma	NaN	NaN	none	none	unknown	NaN	female	feminine	NaN	Human	The Force Awakens	NaN	NaN

87 rows × 14 columns

# starwars %>% 
#   filter(if_any(everything(), ~ !is.na(.x)))
starwars.loc[starwars.notna().all(axis=1)]

	name	height	mass	hair_color	skin_color	eye_color	birth_year	sex	gender	homeworld	species	films	vehicles	starships
0	Luke Skywalker	172.0	77.0	blond	fair	blue	19.0	male	masculine	Tatooine	Human	A New Hope, The Empire Strikes Back, Return of...	Snowspeeder, Imperial Speeder Bike	X-wing, Imperial shuttle
9	Obi-Wan Kenobi	182.0	77.0	auburn, white	fair	blue-gray	57.0	male	masculine	Stewjon	Human	A New Hope, The Empire Strikes Back, Return of...	Tribubble bongo	Jedi starfighter, Trade Federation cruiser, Na...
10	Anakin Skywalker	188.0	84.0	blond	fair	blue	41.9	male	masculine	Tatooine	Human	The Phantom Menace, Attack of the Clones, Reve...	Zephyr-G swoop bike, XJ-6 airspeeder	Naboo fighter, Trade Federation cruiser, Jedi ...
12	Chewbacca	228.0	112.0	brown	unknown	blue	200.0	male	masculine	Kashyyyk	Wookiee	A New Hope, The Empire Strikes Back, Return of...	AT-ST	Millennium Falcon, Imperial shuttle
16	Wedge Antilles	170.0	77.0	brown	fair	hazel	21.0	male	masculine	Corellia	Human	A New Hope, The Empire Strikes Back, Return of...	Snowspeeder	X-wing
42	Darth Maul	175.0	80.0	none	red	yellow	54.0	male	masculine	Dathomir	Zabrak	The Phantom Menace	Sith speeder	Scimitar

Output for the above is different from the vignette’s because Pandas treats none as NaN, and dplyr does not (at least in the vignette)

Let’s look at another solution, from Stack Overflow:

url = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
cars = pd.read_csv(url)
cars.head()

	model	mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
0	Mazda RX4	21.0	6	160.0	110	3.90	2.620	16.46	0	1	4	4
1	Mazda RX4 Wag	21.0	6	160.0	110	3.90	2.875	17.02	0	1	4	4
2	Datsun 710	22.8	4	108.0	93	3.85	2.320	18.61	1	1	4	1
3	Hornet 4 Drive	21.4	6	258.0	110	3.08	3.215	19.44	1	0	3	1
4	Hornet Sportabout	18.7	8	360.0	175	3.15	3.440	17.02	0	0	3	2

# dplyr:
# dat <- group_by(mtcars, cyl) 
# summarize(dat, across(ends_with('p'), sum), across(ends_with('t'), mean))
aggs = {key:"mean" if key.endswith("t") else "sum" for key in cars if key.endswith(('p', 't'))}
cars.groupby('cyl').agg(aggs)

	disp	hp	drat	wt
cyl
4	1156.5	909	4.070909	2.285727
6	1283.2	856	3.585714	3.117143
8	4943.4	2929	3.229286	3.999214

An alternate approach, where the groupby is reused, is with pyjanitor’s get_columns function:

grp = cars.groupby('cyl')
summary = [jn.get_columns(grp, '*p').sum(), jn.get_columns(grp, '*t').mean()]
pd.concat(summary, axis = 1)

	disp	hp	drat	wt
cyl
4	1156.5	909	4.070909	2.285727
6	1283.2	856	3.585714	3.117143
8	4943.4	2929	3.229286	3.999214

with janitor.summarise:

cars.summarise(("*p","sum"), ("*t","mean"),by='cyl')

	disp	hp	drat	wt
cyl
6	1283.2	856	3.585714	3.117143
4	1156.5	909	4.070909	2.285727
8	4943.4	2929	3.229286	3.999214

Dplyr’s across: Replicating within Pandas

Contents

Dplyr’s across: Replicating within Pandas#

Comments#