Dplyr’s across: Replicating within Polars#
dplyr has the across function, which is meant to make column wise processing easy. This article aims to replicate solutions in the dplyr column-wise operations vignette with Polars.
Let’s load in the relevant libraries
import polars as pl
import polars.selectors as cs
import sys
print(" polars version :", pl.__version__, "\n", "python version :", sys.version)
polars version : 1.7.1
python version : 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]
# https://raw.githubusercontent.com/tidyverse/dplyr/main/data-raw/starwars.csv
starwars = pl.read_csv('Data_files/starwars.csv')
starwars.head()
shape: (5, 14)
name | height | mass | hair_color | skin_color | eye_color | birth_year | sex | gender | homeworld | species | films | vehicles | starships |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
str | f64 | f64 | str | str | str | f64 | str | str | str | str | str | str | str |
"Luke Skywalker" | 172.0 | 77.0 | "blond" | "fair" | "blue" | 19.0 | "male" | "masculine" | "Tatooine" | "Human" | "A New Hope, The Empire Strikes… | "Snowspeeder, Imperial Speeder … | "X-wing, Imperial shuttle" |
"C-3PO" | 167.0 | 75.0 | null | "gold" | "yellow" | 112.0 | "none" | "masculine" | "Tatooine" | "Droid" | "A New Hope, The Empire Strikes… | null | null |
"R2-D2" | 96.0 | 32.0 | null | "white, blue" | "red" | 33.0 | "none" | "masculine" | "Naboo" | "Droid" | "A New Hope, The Empire Strikes… | null | null |
"Darth Vader" | 202.0 | 136.0 | "none" | "white" | "yellow" | 41.9 | "male" | "masculine" | "Tatooine" | "Human" | "A New Hope, The Empire Strikes… | null | "TIE Advanced x1" |
"Leia Organa" | 150.0 | 49.0 | "brown" | "light" | "brown" | 19.0 | "female" | "feminine" | "Alderaan" | "Human" | "A New Hope, The Empire Strikes… | "Imperial Speeder Bike" | null |
# dplyr :
# starwars %>%
# summarise(across(where(is.character), n_distinct))
starwars.select(cs.string().n_unique())
shape: (1, 11)
name | hair_color | skin_color | eye_color | sex | gender | homeworld | species | films | vehicles | starships |
---|---|---|---|---|---|---|---|---|---|---|
u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 |
87 | 12 | 31 | 15 | 5 | 3 | 49 | 38 | 24 | 11 | 16 |
# dplyr :
# starwars %>%
# group_by(species) %>%
# filter(n() > 1) %>%
# summarise(across(c(sex, gender, homeworld), n_distinct))
(starwars
.group_by("species")
.agg(pl.n_unique("sex", "gender", "homeworld"),
pl.len().alias("n")
)
.filter(pl.col("n").gt(1))
.select(pl.exclude("n"))
)
shape: (9, 4)
species | sex | gender | homeworld |
---|---|---|---|
str | u32 | u32 | u32 |
"Wookiee" | 1 | 1 | 1 |
"Mirialan" | 1 | 1 | 1 |
"Human" | 2 | 2 | 15 |
"Zabrak" | 1 | 1 | 2 |
"Twi'lek" | 2 | 2 | 1 |
null | 1 | 1 | 3 |
"Droid" | 1 | 2 | 3 |
"Gungan" | 1 | 1 | 1 |
"Kaminoan" | 2 | 2 | 1 |
# dplyr :
# starwars %>%
# group_by(homeworld) %>%
# filter(n() > 1) %>%
# summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
(starwars
.group_by("homeworld")
.agg(cs.numeric().mean(), pl.len().alias("n"))
.filter(pl.col("n").gt(1))
.select(pl.exclude("n"))
)
shape: (10, 4)
homeworld | height | mass | birth_year |
---|---|---|---|
str | f64 | f64 | f64 |
"Coruscant" | 173.666667 | 50.0 | 91.0 |
"Mirial" | 168.0 | 53.1 | 49.0 |
"Kamino" | 208.333333 | 83.1 | 31.5 |
"Naboo" | 177.272727 | 64.166667 | 55.0 |
"Tatooine" | 169.8 | 85.375 | 54.644444 |
null | 138.75 | 82.0 | 334.333333 |
"Ryloth" | 179.0 | 55.0 | 48.0 |
"Alderaan" | 176.333333 | 64.0 | 43.0 |
"Kashyyyk" | 231.0 | 124.0 | 200.0 |
"Corellia" | 175.0 | 78.5 | 25.0 |
# dplyr:
# starwars %>% distinct(across(contains("color")))
starwars.select(cs.ends_with('color')).unique()
shape: (67, 3)
hair_color | skin_color | eye_color |
---|---|---|
str | str | str |
"none" | "white, blue" | "black" |
"brown, grey" | "light" | "blue" |
"auburn, white" | "fair" | "blue-gray" |
"none" | "grey, blue" | "unknown" |
"none" | "pale" | "pink" |
… | … | … |
"blonde" | "fair, green, yellow" | "yellow" |
"none" | "brown" | "yellow" |
"black" | "tan" | "brown" |
"brown" | "fair" | "brown" |
"none" | "pale" | "orange" |
# dplyr
# starwars %>% count(across(contains("color")), sort = TRUE)
(starwars
.select(pl.struct(cs.ends_with('color')).value_counts())
.unnest('hair_color')
.unnest('hair_color')
)
shape: (67, 4)
hair_color | skin_color | eye_color | count |
---|---|---|---|
str | str | str | u32 |
"black" | "dark" | "dark" | 1 |
null | "white, red" | "red" | 1 |
"none" | "none" | "unknown" | 1 |
"brown" | "light" | "blue" | 1 |
"none" | "red" | "yellow" | 1 |
… | … | … | … |
"none" | "white, blue" | "black" | 1 |
"brown" | "fair" | "blue" | 4 |
"none" | "green" | "yellow" | 1 |
null | "gold" | "yellow" | 1 |
"black" | "tan" | "brown" | 2 |
# starwars %>%
# filter(if_any(everything(), ~ !is.na(.x)))
starwars.filter(pl.any_horizontal(pl.all().is_not_null()))
shape: (87, 14)
name | height | mass | hair_color | skin_color | eye_color | birth_year | sex | gender | homeworld | species | films | vehicles | starships |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
str | f64 | f64 | str | str | str | f64 | str | str | str | str | str | str | str |
"Luke Skywalker" | 172.0 | 77.0 | "blond" | "fair" | "blue" | 19.0 | "male" | "masculine" | "Tatooine" | "Human" | "A New Hope, The Empire Strikes… | "Snowspeeder, Imperial Speeder … | "X-wing, Imperial shuttle" |
"C-3PO" | 167.0 | 75.0 | null | "gold" | "yellow" | 112.0 | "none" | "masculine" | "Tatooine" | "Droid" | "A New Hope, The Empire Strikes… | null | null |
"R2-D2" | 96.0 | 32.0 | null | "white, blue" | "red" | 33.0 | "none" | "masculine" | "Naboo" | "Droid" | "A New Hope, The Empire Strikes… | null | null |
"Darth Vader" | 202.0 | 136.0 | "none" | "white" | "yellow" | 41.9 | "male" | "masculine" | "Tatooine" | "Human" | "A New Hope, The Empire Strikes… | null | "TIE Advanced x1" |
"Leia Organa" | 150.0 | 49.0 | "brown" | "light" | "brown" | 19.0 | "female" | "feminine" | "Alderaan" | "Human" | "A New Hope, The Empire Strikes… | "Imperial Speeder Bike" | null |
… | … | … | … | … | … | … | … | … | … | … | … | … | … |
"Finn" | null | null | "black" | "dark" | "dark" | null | "male" | "masculine" | null | "Human" | "The Force Awakens" | null | null |
"Rey" | null | null | "brown" | "light" | "hazel" | null | "female" | "feminine" | null | "Human" | "The Force Awakens" | null | null |
"Poe Dameron" | null | null | "brown" | "light" | "brown" | null | "male" | "masculine" | null | "Human" | "The Force Awakens" | null | "X-wing" |
"BB8" | null | null | "none" | "none" | "black" | null | "none" | "masculine" | null | "Droid" | "The Force Awakens" | null | null |
"Captain Phasma" | null | null | "none" | "none" | "unknown" | null | "female" | "feminine" | null | "Human" | "The Force Awakens" | null | null |
Let’s look at another solution, from Stack Overflow:
# "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
cars = pl.read_csv('Data_files/cars.csv')
cars.head()
shape: (5, 12)
model | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
---|---|---|---|---|---|---|---|---|---|---|---|
str | f64 | i64 | f64 | i64 | f64 | f64 | f64 | i64 | i64 | i64 | i64 |
"Mazda RX4" | 21.0 | 6 | 160.0 | 110 | 3.9 | 2.62 | 16.46 | 0 | 1 | 4 | 4 |
"Mazda RX4 Wag" | 21.0 | 6 | 160.0 | 110 | 3.9 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
"Datsun 710" | 22.8 | 4 | 108.0 | 93 | 3.85 | 2.32 | 18.61 | 1 | 1 | 4 | 1 |
"Hornet 4 Drive" | 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
"Hornet Sportabout" | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.44 | 17.02 | 0 | 0 | 3 | 2 |
# dplyr:
# dat <- group_by(mtcars, cyl)
# summarize(dat, across(ends_with('p'), sum), across(ends_with('t'), mean))
cars.group_by('cyl').agg(cs.ends_with('p').mean(), cs.ends_with('t').sum())
shape: (3, 5)
cyl | disp | hp | drat | wt |
---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 |
4 | 105.136364 | 82.636364 | 44.78 | 25.143 |
8 | 353.1 | 209.214286 | 45.21 | 55.989 |
6 | 183.314286 | 122.285714 | 25.1 | 21.82 |
Comments#