Dplyr’s across: Replicating within Polars

Dplyr’s across: Replicating within Polars#

dplyr has the across function, which is meant to make column wise processing easy. This article aims to replicate solutions in the dplyr column-wise operations vignette with Polars.

Let’s load in the relevant libraries

import polars as pl
import polars.selectors as cs
import sys

print(" polars version :", pl.__version__, "\n", "python version :", sys.version)

 polars version : 1.7.1 
 python version : 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]

# https://raw.githubusercontent.com/tidyverse/dplyr/main/data-raw/starwars.csv
starwars = pl.read_csv('Data_files/starwars.csv')
starwars.head()

shape: (5, 14)

name	height	mass	hair_color	skin_color	eye_color	birth_year	sex	gender	homeworld	species	films	vehicles	starships
str	f64	f64	str	str	str	f64	str	str	str	str	str	str	str
"Luke Skywalker"	172.0	77.0	"blond"	"fair"	"blue"	19.0	"male"	"masculine"	"Tatooine"	"Human"	"A New Hope, The Empire Strikes…	"Snowspeeder, Imperial Speeder …	"X-wing, Imperial shuttle"
"C-3PO"	167.0	75.0	null	"gold"	"yellow"	112.0	"none"	"masculine"	"Tatooine"	"Droid"	"A New Hope, The Empire Strikes…	null	null
"R2-D2"	96.0	32.0	null	"white, blue"	"red"	33.0	"none"	"masculine"	"Naboo"	"Droid"	"A New Hope, The Empire Strikes…	null	null
"Darth Vader"	202.0	136.0	"none"	"white"	"yellow"	41.9	"male"	"masculine"	"Tatooine"	"Human"	"A New Hope, The Empire Strikes…	null	"TIE Advanced x1"
"Leia Organa"	150.0	49.0	"brown"	"light"	"brown"	19.0	"female"	"feminine"	"Alderaan"	"Human"	"A New Hope, The Empire Strikes…	"Imperial Speeder Bike"	null

# dplyr :
# starwars %>%
#   summarise(across(where(is.character), n_distinct))
starwars.select(cs.string().n_unique())

shape: (1, 11)

name	hair_color	skin_color	eye_color	sex	gender	homeworld	species	films	vehicles	starships
u32	u32	u32	u32	u32	u32	u32	u32	u32	u32	u32
87	12	31	15	5	3	49	38	24	11	16

# dplyr :
# starwars %>%
#   group_by(species) %>%
#   filter(n() > 1) %>%
#   summarise(across(c(sex, gender, homeworld), n_distinct))
(starwars
 .group_by("species")
 .agg(pl.n_unique("sex", "gender", "homeworld"), 
      pl.len().alias("n")
)
.filter(pl.col("n").gt(1))
.select(pl.exclude("n"))
)

shape: (9, 4)

species	sex	gender	homeworld
str	u32	u32	u32
"Wookiee"	1	1	1
"Mirialan"	1	1	1
"Human"	2	2	15
"Zabrak"	1	1	2
"Twi'lek"	2	2	1
null	1	1	3
"Droid"	1	2	3
"Gungan"	1	1	1
"Kaminoan"	2	2	1

# dplyr :
# starwars %>%
#   group_by(homeworld) %>%
#   filter(n() > 1) %>%
#   summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
(starwars
 .group_by("homeworld")
 .agg(cs.numeric().mean(), pl.len().alias("n"))
 .filter(pl.col("n").gt(1))
.select(pl.exclude("n"))
)

shape: (10, 4)

homeworld	height	mass	birth_year
str	f64	f64	f64
"Coruscant"	173.666667	50.0	91.0
"Mirial"	168.0	53.1	49.0
"Kamino"	208.333333	83.1	31.5
"Naboo"	177.272727	64.166667	55.0
"Tatooine"	169.8	85.375	54.644444
null	138.75	82.0	334.333333
"Ryloth"	179.0	55.0	48.0
"Alderaan"	176.333333	64.0	43.0
"Kashyyyk"	231.0	124.0	200.0
"Corellia"	175.0	78.5	25.0

# dplyr:
# starwars %>% distinct(across(contains("color")))
starwars.select(cs.ends_with('color')).unique()

shape: (67, 3)

hair_color	skin_color	eye_color
str	str	str
"none"	"white, blue"	"black"
"brown, grey"	"light"	"blue"
"auburn, white"	"fair"	"blue-gray"
"none"	"grey, blue"	"unknown"
"none"	"pale"	"pink"
…	…	…
"blonde"	"fair, green, yellow"	"yellow"
"none"	"brown"	"yellow"
"black"	"tan"	"brown"
"brown"	"fair"	"brown"
"none"	"pale"	"orange"

# dplyr
# starwars %>% count(across(contains("color")), sort = TRUE)
(starwars
 .select(pl.struct(cs.ends_with('color')).value_counts())
 .unnest('hair_color')
 .unnest('hair_color')
 )

shape: (67, 4)

hair_color	skin_color	eye_color	count
str	str	str	u32
"black"	"dark"	"dark"	1
null	"white, red"	"red"	1
"none"	"none"	"unknown"	1
"brown"	"light"	"blue"	1
"none"	"red"	"yellow"	1
…	…	…	…
"none"	"white, blue"	"black"	1
"brown"	"fair"	"blue"	4
"none"	"green"	"yellow"	1
null	"gold"	"yellow"	1
"black"	"tan"	"brown"	2

# starwars %>%
#   filter(if_any(everything(), ~ !is.na(.x)))
starwars.filter(pl.any_horizontal(pl.all().is_not_null()))

shape: (87, 14)

name	height	mass	hair_color	skin_color	eye_color	birth_year	sex	gender	homeworld	species	films	vehicles	starships
str	f64	f64	str	str	str	f64	str	str	str	str	str	str	str
"Luke Skywalker"	172.0	77.0	"blond"	"fair"	"blue"	19.0	"male"	"masculine"	"Tatooine"	"Human"	"A New Hope, The Empire Strikes…	"Snowspeeder, Imperial Speeder …	"X-wing, Imperial shuttle"
"C-3PO"	167.0	75.0	null	"gold"	"yellow"	112.0	"none"	"masculine"	"Tatooine"	"Droid"	"A New Hope, The Empire Strikes…	null	null
"R2-D2"	96.0	32.0	null	"white, blue"	"red"	33.0	"none"	"masculine"	"Naboo"	"Droid"	"A New Hope, The Empire Strikes…	null	null
"Darth Vader"	202.0	136.0	"none"	"white"	"yellow"	41.9	"male"	"masculine"	"Tatooine"	"Human"	"A New Hope, The Empire Strikes…	null	"TIE Advanced x1"
"Leia Organa"	150.0	49.0	"brown"	"light"	"brown"	19.0	"female"	"feminine"	"Alderaan"	"Human"	"A New Hope, The Empire Strikes…	"Imperial Speeder Bike"	null
…	…	…	…	…	…	…	…	…	…	…	…	…	…
"Finn"	null	null	"black"	"dark"	"dark"	null	"male"	"masculine"	null	"Human"	"The Force Awakens"	null	null
"Rey"	null	null	"brown"	"light"	"hazel"	null	"female"	"feminine"	null	"Human"	"The Force Awakens"	null	null
"Poe Dameron"	null	null	"brown"	"light"	"brown"	null	"male"	"masculine"	null	"Human"	"The Force Awakens"	null	"X-wing"
"BB8"	null	null	"none"	"none"	"black"	null	"none"	"masculine"	null	"Droid"	"The Force Awakens"	null	null
"Captain Phasma"	null	null	"none"	"none"	"unknown"	null	"female"	"feminine"	null	"Human"	"The Force Awakens"	null	null

Let’s look at another solution, from Stack Overflow:

# "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
cars = pl.read_csv('Data_files/cars.csv')
cars.head()

shape: (5, 12)

model	mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
str	f64	i64	f64	i64	f64	f64	f64	i64	i64	i64	i64
"Mazda RX4"	21.0	6	160.0	110	3.9	2.62	16.46	0	1	4	4
"Mazda RX4 Wag"	21.0	6	160.0	110	3.9	2.875	17.02	0	1	4	4
"Datsun 710"	22.8	4	108.0	93	3.85	2.32	18.61	1	1	4	1
"Hornet 4 Drive"	21.4	6	258.0	110	3.08	3.215	19.44	1	0	3	1
"Hornet Sportabout"	18.7	8	360.0	175	3.15	3.44	17.02	0	0	3	2

# dplyr:
# dat <- group_by(mtcars, cyl)
# summarize(dat, across(ends_with('p'), sum), across(ends_with('t'), mean))
cars.group_by('cyl').agg(cs.ends_with('p').mean(), cs.ends_with('t').sum())

shape: (3, 5)

cyl	disp	hp	drat	wt
i64	f64	f64	f64	f64
4	105.136364	82.636364	44.78	25.143
8	353.1	209.214286	45.21	55.989
6	183.314286	122.285714	25.1	21.82

Dplyr’s across: Replicating within Polars

Contents

Dplyr’s across: Replicating within Polars#

Comments#