Dplyr’s across: Replicating within Polars

Contents

Dplyr’s across: Replicating within Polars

dplyr has the `across` function, which is meant to make column-wise processing easy. This article replicates the solutions from the dplyr column-wise operations vignette using Polars.

Let’s load the relevant libraries:

import polars as pl
import polars.selectors as cs
import sys

# Report the Polars and Python versions used to produce the outputs in this article.
print(f" polars version : {pl.__version__} \n python version : {sys.version}")
 polars version : 1.7.1 
 python version : 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]
# Local copy of the dplyr starwars dataset; upstream source:
# https://raw.githubusercontent.com/tidyverse/dplyr/main/data-raw/starwars.csv
starwars_path = 'Data_files/starwars.csv'
starwars = pl.read_csv(starwars_path)
# Preview the first five rows.
starwars.head(n=5)
shape: (5, 14)
nameheightmasshair_colorskin_coloreye_colorbirth_yearsexgenderhomeworldspeciesfilmsvehiclesstarships
strf64f64strstrstrf64strstrstrstrstrstrstr
"Luke Skywalker"172.077.0"blond""fair""blue"19.0"male""masculine""Tatooine""Human""A New Hope, The Empire Strikes…"Snowspeeder, Imperial Speeder …"X-wing, Imperial shuttle"
"C-3PO"167.075.0null"gold""yellow"112.0"none""masculine""Tatooine""Droid""A New Hope, The Empire Strikes…nullnull
"R2-D2"96.032.0null"white, blue""red"33.0"none""masculine""Naboo""Droid""A New Hope, The Empire Strikes…nullnull
"Darth Vader"202.0136.0"none""white""yellow"41.9"male""masculine""Tatooine""Human""A New Hope, The Empire Strikes…null"TIE Advanced x1"
"Leia Organa"150.049.0"brown""light""brown"19.0"female""feminine""Alderaan""Human""A New Hope, The Empire Strikes…"Imperial Speeder Bike"null
# dplyr:
# starwars %>%
#   summarise(across(where(is.character), n_distinct))
#
# Polars: select every string column and count its distinct values.
string_columns = cs.string()
starwars.select(string_columns.n_unique())
shape: (1, 11)
namehair_colorskin_coloreye_colorsexgenderhomeworldspeciesfilmsvehiclesstarships
u32u32u32u32u32u32u32u32u32u32u32
87123115534938241116
# dplyr:
# starwars %>%
#   group_by(species) %>%
#   filter(n() > 1) %>%
#   summarise(across(c(sex, gender, homeworld), n_distinct))
#
# Polars: aggregate first, carrying the group size along as a helper column "n",
# then keep only groups with more than one row and discard the helper column.
species_summary = starwars.group_by("species").agg(
    pl.n_unique("sex", "gender", "homeworld"),
    n=pl.len(),
)
species_summary.filter(pl.col("n") > 1).select(pl.exclude("n"))
shape: (9, 4)
speciessexgenderhomeworld
stru32u32u32
"Wookiee"111
"Mirialan"111
"Human"2215
"Zabrak"112
"Twi'lek"221
null113
"Droid"123
"Gungan"111
"Kaminoan"221
# dplyr:
# starwars %>%
#   group_by(homeworld) %>%
#   filter(n() > 1) %>%
#   summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
#
# Polars: average every numeric column per homeworld, carrying the group size
# along as a helper column "n" so that single-row groups can be filtered out.
homeworld_means = starwars.group_by("homeworld").agg(
    cs.numeric().mean(),
    n=pl.len(),
)
homeworld_means.filter(pl.col("n") > 1).select(pl.exclude("n"))
shape: (10, 4)
homeworldheightmassbirth_year
strf64f64f64
"Coruscant"173.66666750.091.0
"Mirial"168.053.149.0
"Kamino"208.33333383.131.5
"Naboo"177.27272764.16666755.0
"Tatooine"169.885.37554.644444
null138.7582.0334.333333
"Ryloth"179.055.048.0
"Alderaan"176.33333364.043.0
"Kashyyyk"231.0124.0200.0
"Corellia"175.078.525.0
# dplyr:
# starwars %>% distinct(across(contains("color")))
#
# Polars: cs.contains is the direct counterpart of dplyr's contains(); it picks
# every column whose name includes "color", then unique() keeps distinct rows.
starwars.select(cs.contains('color')).unique()
shape: (67, 3)
hair_colorskin_coloreye_color
strstrstr
"none""white, blue""black"
"brown, grey""light""blue"
"auburn, white""fair""blue-gray"
"none""grey, blue""unknown"
"none""pale""pink"
"blonde""fair, green, yellow""yellow"
"none""brown""yellow"
"black""tan""brown"
"brown""fair""brown"
"none""pale""orange"
# dplyr:
# starwars %>% count(across(contains("color")), sort = TRUE)
#
# Polars: pack the colour columns into one struct so that value_counts() counts
# row combinations. cs.contains mirrors dplyr's contains().
# The packed struct takes the name of its first field ('hair_color'), and
# value_counts() wraps it in a second struct together with the count, hence the
# two unnest calls on the same name.
# NOTE(review): dplyr's `sort = TRUE` is not replicated here, so rows come back
# in no particular order; pass sort=True to value_counts() for a descending count.
(starwars
 .select(pl.struct(cs.contains('color')).value_counts())
 .unnest('hair_color')
 .unnest('hair_color')
 )
shape: (67, 4)
hair_colorskin_coloreye_colorcount
strstrstru32
"black""dark""dark"1
null"white, red""red"1
"none""none""unknown"1
"brown""light""blue"1
"none""red""yellow"1
"none""white, blue""black"1
"brown""fair""blue"4
"none""green""yellow"1
null"gold""yellow"1
"black""tan""brown"2
# dplyr:
# starwars %>%
#   filter(if_any(everything(), ~ !is.na(.x)))
#
# Polars: keep a row when at least one of its columns holds a non-null value.
has_any_value = pl.any_horizontal(pl.all().is_not_null())
starwars.filter(has_any_value)
shape: (87, 14)
nameheightmasshair_colorskin_coloreye_colorbirth_yearsexgenderhomeworldspeciesfilmsvehiclesstarships
strf64f64strstrstrf64strstrstrstrstrstrstr
"Luke Skywalker"172.077.0"blond""fair""blue"19.0"male""masculine""Tatooine""Human""A New Hope, The Empire Strikes…"Snowspeeder, Imperial Speeder …"X-wing, Imperial shuttle"
"C-3PO"167.075.0null"gold""yellow"112.0"none""masculine""Tatooine""Droid""A New Hope, The Empire Strikes…nullnull
"R2-D2"96.032.0null"white, blue""red"33.0"none""masculine""Naboo""Droid""A New Hope, The Empire Strikes…nullnull
"Darth Vader"202.0136.0"none""white""yellow"41.9"male""masculine""Tatooine""Human""A New Hope, The Empire Strikes…null"TIE Advanced x1"
"Leia Organa"150.049.0"brown""light""brown"19.0"female""feminine""Alderaan""Human""A New Hope, The Empire Strikes…"Imperial Speeder Bike"null
"Finn"nullnull"black""dark""dark"null"male""masculine"null"Human""The Force Awakens"nullnull
"Rey"nullnull"brown""light""hazel"null"female""feminine"null"Human""The Force Awakens"nullnull
"Poe Dameron"nullnull"brown""light""brown"null"male""masculine"null"Human""The Force Awakens"null"X-wing"
"BB8"nullnull"none""none""black"null"none""masculine"null"Droid""The Force Awakens"nullnull
"Captain Phasma"nullnull"none""none""unknown"null"female""feminine"null"Human""The Force Awakens"nullnull

Let’s look at another solution, from Stack Overflow:

# Local copy of the mtcars dataset; upstream source:
# "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
cars_path = 'Data_files/cars.csv'
cars = pl.read_csv(cars_path)
# Preview the first five rows.
cars.head(n=5)
shape: (5, 12)
modelmpgcyldisphpdratwtqsecvsamgearcarb
strf64i64f64i64f64f64f64i64i64i64i64
"Mazda RX4"21.06160.01103.92.6216.460144
"Mazda RX4 Wag"21.06160.01103.92.87517.020144
"Datsun 710"22.84108.0933.852.3218.611141
"Hornet 4 Drive"21.46258.01103.083.21519.441031
"Hornet Sportabout"18.78360.01753.153.4417.020032
# dplyr:
# dat <- group_by(mtcars, cyl)
# summarize(dat, across(ends_with('p'), sum), across(ends_with('t'), mean))
cars.group_by('cyl').agg(cs.ends_with('p').mean(), cs.ends_with('t').sum())
shape: (3, 5)
cyldisphpdratwt
i64f64f64f64f64
4105.13636482.63636444.7825.143
8353.1209.21428645.2155.989
6183.314286122.28571425.121.82

Comments#