name: module4 class: title-slide, right, middle, hide-count, hide-logo background-image: url("https://images.unsplash.com/photo-1591903934817-c02acbfd4665?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=1932&q=80") background-size: cover <br> <br> <br> <br> # .black.big-text[Data<br>Wrangling] ## .black[Session - 4] .footnote[ .white[Image credits:][Karina L](https://unsplash.com/photos/zJRQ5nngdPA) ] --- class: center # Course Progress <img src="images/data-science-wrangle.png" width="100%" style="display: block; margin: auto;" /> --- # What is Data wrangling? -- - "data exploration and data manipulation" [(Jesse Mostipak)](https://www.kaggle.com/jessemostipak/dive-into-dplyr-tutorial-1) -- - "tidying and transforming" [(Hadley & Garrett)](https://r4ds.had.co.nz/index.html) -- <img src="images/tidy-1.png" width="100%" style="display: block; margin: auto;" /> --- # "Transforming" data means: - "narrowing in on observations of interest ... -- - creating new variables that are functions of existing variables ... and -- - calculating a set of summary statistics." .footnote[[Source](https://r4ds.had.co.nz/index.html)] --- class: hide-count, middle, hide-logo background-image: url(images/dplyr.svg) background-size: contain background-position: right # .big-text[R<br>Package] --- # `dplyr` package - "dplyr is a grammar of data manipulation" -- - "providing a consistent set of verbs that help you solve the most common data manipulation challenges:" -- - Few important functions: - `filter()` - `select()` - `mutate()` - `arrange()` - `summarise()` .footnote[ [Source](https://dplyr.tidyverse.org/) ] --- # `filter()` function: - Picks cases based on their values. <img src="images/01-filter.png" width="65%" style="display: block; margin: auto;" /> --- class: center, middle # How to have a data of only Gentoo penguins? 
--- .panelset[ .panel[.panel-name[Codes] ```r # there are three species: Chinstrap, Gentoo, Adelie penguins %>% * filter(species == "Gentoo") ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 124 × 8 ## species island bill_length_mm bill_depth_mm flipper_len…¹ body_…² sex year ## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> ## 1 Gentoo Biscoe 46.1 13.2 211 4500 fema… 2007 ## 2 Gentoo Biscoe 50 16.3 230 5700 male 2007 ## 3 Gentoo Biscoe 48.7 14.1 210 4450 fema… 2007 ## 4 Gentoo Biscoe 50 15.2 218 5700 male 2007 ## 5 Gentoo Biscoe 47.6 14.5 215 5400 male 2007 ## 6 Gentoo Biscoe 46.5 13.5 210 4550 fema… 2007 ## 7 Gentoo Biscoe 45.4 14.6 211 4800 fema… 2007 ## 8 Gentoo Biscoe 46.7 15.3 219 5200 male 2007 ## 9 Gentoo Biscoe 43.3 13.4 209 4400 fema… 2007 ## 10 Gentoo Biscoe 46.8 15.4 215 5150 male 2007 ## # … with 114 more rows, and abbreviated variable names ¹flipper_length_mm, ## # ²body_mass_g ``` ] ] --- .panelset[ .panel[.panel-name[Codes] ```r # there are three species: Chinstrap, Gentoo, Adelie praw <- read_csv("data/gentoo-penguins1.csv") praw %>% filter(species == "Gentoo") %>% summary() %>% kableExtra::kable() ``` ] .panel[.panel-name[Output] <table> <thead> <tr> <th style="text-align:left;"> </th> <th style="text-align:left;"> species </th> <th style="text-align:left;"> island </th> <th style="text-align:left;"> bill_length_mm </th> <th style="text-align:left;"> bill_depth_mm </th> <th style="text-align:left;"> flipper_length_mm </th> <th style="text-align:left;"> body_mass_g </th> <th style="text-align:left;"> sex </th> <th style="text-align:left;"> year </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> </td> <td style="text-align:left;"> Length:124 </td> <td style="text-align:left;"> Length:124 </td> <td style="text-align:left;"> Min. :40.90 </td> <td style="text-align:left;"> Min. :13.10 </td> <td style="text-align:left;"> Min. :203.0 </td> <td style="text-align:left;"> Min. 
:3950 </td> <td style="text-align:left;"> Length:124 </td> <td style="text-align:left;"> Min. :2007 </td> </tr> <tr> <td style="text-align:left;"> </td> <td style="text-align:left;"> Class :character </td> <td style="text-align:left;"> Class :character </td> <td style="text-align:left;"> 1st Qu.:45.30 </td> <td style="text-align:left;"> 1st Qu.:14.20 </td> <td style="text-align:left;"> 1st Qu.:212.0 </td> <td style="text-align:left;"> 1st Qu.:4500 </td> <td style="text-align:left;"> Class :character </td> <td style="text-align:left;"> 1st Qu.:2007 </td> </tr> <tr> <td style="text-align:left;"> </td> <td style="text-align:left;"> Mode :character </td> <td style="text-align:left;"> Mode :character </td> <td style="text-align:left;"> Median :47.30 </td> <td style="text-align:left;"> Median :15.00 </td> <td style="text-align:left;"> Median :216.0 </td> <td style="text-align:left;"> Median :4925 </td> <td style="text-align:left;"> Mode :character </td> <td style="text-align:left;"> Median :2008 </td> </tr> <tr> <td style="text-align:left;"> </td> <td style="text-align:left;"> NA </td> <td style="text-align:left;"> NA </td> <td style="text-align:left;"> Mean :47.50 </td> <td style="text-align:left;"> Mean :14.98 </td> <td style="text-align:left;"> Mean :217.2 </td> <td style="text-align:left;"> Mean :4985 </td> <td style="text-align:left;"> NA </td> <td style="text-align:left;"> Mean :2008 </td> </tr> <tr> <td style="text-align:left;"> </td> <td style="text-align:left;"> NA </td> <td style="text-align:left;"> NA </td> <td style="text-align:left;"> 3rd Qu.:49.55 </td> <td style="text-align:left;"> 3rd Qu.:15.70 </td> <td style="text-align:left;"> 3rd Qu.:221.0 </td> <td style="text-align:left;"> 3rd Qu.:5400 </td> <td style="text-align:left;"> NA </td> <td style="text-align:left;"> 3rd Qu.:2009 </td> </tr> <tr> <td style="text-align:left;"> </td> <td style="text-align:left;"> NA </td> <td style="text-align:left;"> NA </td> <td style="text-align:left;"> Max. 
:59.60 </td> <td style="text-align:left;"> Max. :17.30 </td> <td style="text-align:left;"> Max. :231.0 </td> <td style="text-align:left;"> Max. :6050 </td> <td style="text-align:left;"> NA </td> <td style="text-align:left;"> Max. :2009 </td> </tr> <tr> <td style="text-align:left;"> </td> <td style="text-align:left;"> NA </td> <td style="text-align:left;"> NA </td> <td style="text-align:left;"> NA's :1 </td> <td style="text-align:left;"> NA's :1 </td> <td style="text-align:left;"> NA's :1 </td> <td style="text-align:left;"> NA's :1 </td> <td style="text-align:left;"> NA </td> <td style="text-align:left;"> NA </td> </tr> </tbody> </table> ] ] --- class: center, middle # How to export data file to your computer? --- .panelset[ .panel[.panel-name[Codes] ```r # three species are Chinstrap, Gentoo, Adelie penguins %>% filter(species == "Gentoo") %>% * write_csv("data/gentoo-penguins.csv") ``` ] .panel[.panel-name[Output] ] ] --- # ✋ WAIT! What is `%>% ` -- - this is called **pipe** ( `%>%` = control + shift + m) -- - "a powerful tool for clearly expressing a sequence of **multiple operations**" -- - interpret/read it as **then**. ```r penguins %>% filter(species == "Gentoo") %>% summary() %>% kableExtra::kable() ``` --- # Comparison: Relational Operators `x < y` -- `x > y` -- `x <= y` -- `x >= y` -- `x == y` (equal) -- `x != y` (not equal) --- class: center, middle # How to have a data of penguins with bill length more than 43 mm? 
--- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% filter(bill_length_mm > 43) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 188 × 8 ## species island bill_length_mm bill_depth_mm flipper_…¹ body_…² sex year ## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> ## 1 Adelie Torgersen 46 21.5 194 4200 male 2007 ## 2 Adelie Dream 44.1 19.7 196 4400 male 2007 ## 3 Adelie Torgersen 45.8 18.9 197 4150 male 2008 ## 4 Adelie Dream 43.2 18.5 192 4100 male 2008 ## 5 Adelie Biscoe 43.2 19 197 4775 male 2009 ## 6 Adelie Biscoe 45.6 20.3 191 4600 male 2009 ## 7 Adelie Torgersen 44.1 18 210 4000 male 2009 ## 8 Adelie Torgersen 43.1 19.2 197 3500 male 2009 ## 9 Gentoo Biscoe 46.1 13.2 211 4500 fema… 2007 ## 10 Gentoo Biscoe 50 16.3 230 5700 male 2007 ## # … with 178 more rows, and abbreviated variable names ¹flipper_length_mm, ## # ²body_mass_g ``` ] ] --- class: center, middle # How to have a data of Gentoo penguins with bill length more than 55 mm? --- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% filter(species == "Gentoo", bill_length_mm > 55) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 3 × 8 ## species island bill_length_mm bill_depth_mm flipper_leng…¹ body_…² sex year ## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> ## 1 Gentoo Biscoe 59.6 17 230 6050 male 2007 ## 2 Gentoo Biscoe 55.9 17 228 5600 male 2009 ## 3 Gentoo Biscoe 55.1 16 230 5850 male 2009 ## # … with abbreviated variable names ¹flipper_length_mm, ²body_mass_g ``` ] ] --- class: center, middle # How to have data of non-Gentoo penguins with bill length more than 45 mm and weight more than 4 kg? 
--- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% filter(species != "Gentoo", bill_length_mm > 45, body_mass_g > 4000) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 18 × 8 ## species island bill_length_mm bill_depth_mm flippe…¹ body_…² sex year ## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> ## 1 Adelie Torgersen 46 21.5 194 4200 male 2007 ## 2 Adelie Torgersen 45.8 18.9 197 4150 male 2008 ## 3 Adelie Biscoe 45.6 20.3 191 4600 male 2009 ## 4 Chinstrap Dream 46 18.9 195 4150 fema… 2007 ## 5 Chinstrap Dream 52 18.1 201 4050 male 2007 ## 6 Chinstrap Dream 50.5 19.6 201 4050 male 2007 ## 7 Chinstrap Dream 49.2 18.2 195 4400 male 2007 ## 8 Chinstrap Dream 52 19 197 4150 male 2007 ## 9 Chinstrap Dream 52.8 20 205 4550 male 2008 ## 10 Chinstrap Dream 54.2 20.8 201 4300 male 2008 ## 11 Chinstrap Dream 51 18.8 203 4100 male 2008 ## 12 Chinstrap Dream 52 20.7 210 4800 male 2008 ## 13 Chinstrap Dream 53.5 19.9 205 4500 male 2008 ## 14 Chinstrap Dream 50.8 18.5 201 4450 male 2009 ## 15 Chinstrap Dream 49 19.6 212 4300 male 2009 ## 16 Chinstrap Dream 50.7 19.7 203 4050 male 2009 ## 17 Chinstrap Dream 49.3 19.9 203 4050 male 2009 ## 18 Chinstrap Dream 50.8 19 210 4100 male 2009 ## # … with abbreviated variable names ¹flipper_length_mm, ²body_mass_g ``` ] ] --- class: center, middle # How to have only top or bottom rows from data? 
--- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% filter(species != "Gentoo", bill_length_mm > 45, body_mass_g > 4000) %>% * head() ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 6 × 8 ## species island bill_length_mm bill_depth_mm flipper…¹ body_…² sex year ## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> ## 1 Adelie Torgersen 46 21.5 194 4200 male 2007 ## 2 Adelie Torgersen 45.8 18.9 197 4150 male 2008 ## 3 Adelie Biscoe 45.6 20.3 191 4600 male 2009 ## 4 Chinstrap Dream 46 18.9 195 4150 fema… 2007 ## 5 Chinstrap Dream 52 18.1 201 4050 male 2007 ## 6 Chinstrap Dream 50.5 19.6 201 4050 male 2007 ## # … with abbreviated variable names ¹flipper_length_mm, ²body_mass_g ``` ] ] --- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% filter(species != "Gentoo", bill_length_mm > 45, body_mass_g > 4000) %>% * tail(3) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 3 × 8 ## species island bill_length_mm bill_depth_mm flipper_le…¹ body_…² sex year ## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> ## 1 Chinstrap Dream 50.7 19.7 203 4050 male 2009 ## 2 Chinstrap Dream 49.3 19.9 203 4050 male 2009 ## 3 Chinstrap Dream 50.8 19 210 4100 male 2009 ## # … with abbreviated variable names ¹flipper_length_mm, ²body_mass_g ``` ] ] --- class: your-turn, hide-logo # 🧠 YOUR TURN
−
+
10
:
00
.panelset[ .panel[.panel-name[Task] How many Chinstrap penguins have a bill length of more than 45 mm and a weight of more than 4 kg? ] .panel[.panel-name[Codes] ```r penguins %>% filter(species == "Chinstrap", bill_length_mm > 45, body_mass_g > 4000) %>% head() ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 6 × 8 ## species island bill_length_mm bill_depth_mm flipper_le…¹ body_…² sex year ## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> ## 1 Chinstrap Dream 46 18.9 195 4150 fema… 2007 ## 2 Chinstrap Dream 52 18.1 201 4050 male 2007 ## 3 Chinstrap Dream 50.5 19.6 201 4050 male 2007 ## 4 Chinstrap Dream 49.2 18.2 195 4400 male 2007 ## 5 Chinstrap Dream 52 19 197 4150 male 2007 ## 6 Chinstrap Dream 52.8 20 205 4550 male 2008 ## # … with abbreviated variable names ¹flipper_length_mm, ²body_mass_g ``` ] ] --- # `select()` function: Picks variables (columns) based on their names. <img src="images/03-select.png" width="60%" style="display: block; margin: auto;" /> --- class: center, middle # How to have only `species` variable in data? --- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% * select(species) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 344 × 1 ## species ## <fct> ## 1 Adelie ## 2 Adelie ## 3 Adelie ## 4 Adelie ## 5 Adelie ## 6 Adelie ## 7 Adelie ## 8 Adelie ## 9 Adelie ## 10 Adelie ## # … with 334 more rows ``` ] ] --- class: center, middle # How to have a specific range of variables in data? 
--- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% select(species : bill_depth_mm) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 344 × 4 ## species island bill_length_mm bill_depth_mm ## <fct> <fct> <dbl> <dbl> ## 1 Adelie Torgersen 39.1 18.7 ## 2 Adelie Torgersen 39.5 17.4 ## 3 Adelie Torgersen 40.3 18 ## 4 Adelie Torgersen NA NA ## 5 Adelie Torgersen 36.7 19.3 ## 6 Adelie Torgersen 39.3 20.6 ## 7 Adelie Torgersen 38.9 17.8 ## 8 Adelie Torgersen 39.2 19.6 ## 9 Adelie Torgersen 34.1 18.1 ## 10 Adelie Torgersen 42 20.2 ## # … with 334 more rows ``` ] ] --- class: center, middle # How to have variables based upon their location in data? --- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% select(4:8) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 344 × 5 ## bill_depth_mm flipper_length_mm body_mass_g sex year ## <dbl> <int> <int> <fct> <int> ## 1 18.7 181 3750 male 2007 ## 2 17.4 186 3800 female 2007 ## 3 18 195 3250 female 2007 ## 4 NA NA NA <NA> 2007 ## 5 19.3 193 3450 female 2007 ## 6 20.6 190 3650 male 2007 ## 7 17.8 181 3625 female 2007 ## 8 19.6 195 4675 male 2007 ## 9 18.1 193 3475 <NA> 2007 ## 10 20.2 190 4250 <NA> 2007 ## # … with 334 more rows ``` ] ] --- class: center, middle # How to have specific variables in data? 
--- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% select(species, body_mass_g, year) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 344 × 3 ## species body_mass_g year ## <fct> <int> <int> ## 1 Adelie 3750 2007 ## 2 Adelie 3800 2007 ## 3 Adelie 3250 2007 ## 4 Adelie NA 2007 ## 5 Adelie 3450 2007 ## 6 Adelie 3650 2007 ## 7 Adelie 3625 2007 ## 8 Adelie 4675 2007 ## 9 Adelie 3475 2007 ## 10 Adelie 4250 2007 ## # … with 334 more rows ``` ] ] --- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% select(-c(species, body_mass_g, year)) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 344 × 5 ## island bill_length_mm bill_depth_mm flipper_length_mm sex ## <fct> <dbl> <dbl> <int> <fct> ## 1 Torgersen 39.1 18.7 181 male ## 2 Torgersen 39.5 17.4 186 female ## 3 Torgersen 40.3 18 195 female ## 4 Torgersen NA NA NA <NA> ## 5 Torgersen 36.7 19.3 193 female ## 6 Torgersen 39.3 20.6 190 male ## 7 Torgersen 38.9 17.8 181 female ## 8 Torgersen 39.2 19.6 195 male ## 9 Torgersen 34.1 18.1 193 <NA> ## 10 Torgersen 42 20.2 190 <NA> ## # … with 334 more rows ``` ] ] --- # `mutate()` function: Adds new variables that are functions of existing variables <img src="images/04-mutate.png" width="75%" style="display: block; margin: auto;" /> --- class: center, middle # How to convert penguin body mass from grams to kilograms? 
--- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% * mutate(body_mass_kg = body_mass_g / 1000) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 344 × 9 ## species island bill_length_mm bill_d…¹ flipp…² body_…³ sex year body_…⁴ ## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> <dbl> ## 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007 3.75 ## 2 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007 3.8 ## 3 Adelie Torgersen 40.3 18 195 3250 fema… 2007 3.25 ## 4 Adelie Torgersen NA NA NA NA <NA> 2007 NA ## 5 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007 3.45 ## 6 Adelie Torgersen 39.3 20.6 190 3650 male 2007 3.65 ## 7 Adelie Torgersen 38.9 17.8 181 3625 fema… 2007 3.62 ## 8 Adelie Torgersen 39.2 19.6 195 4675 male 2007 4.68 ## 9 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007 3.48 ## 10 Adelie Torgersen 42 20.2 190 4250 <NA> 2007 4.25 ## # … with 334 more rows, and abbreviated variable names ¹bill_depth_mm, ## # ²flipper_length_mm, ³body_mass_g, ⁴body_mass_kg ``` ] ] --- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% select(body_mass_g) %>% mutate(body_mass_kg = body_mass_g / 1000) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 344 × 2 ## body_mass_g body_mass_kg ## <int> <dbl> ## 1 3750 3.75 ## 2 3800 3.8 ## 3 3250 3.25 ## 4 NA NA ## 5 3450 3.45 ## 6 3650 3.65 ## 7 3625 3.62 ## 8 4675 4.68 ## 9 3475 3.48 ## 10 4250 4.25 ## # … with 334 more rows ``` ] ] --- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% mutate(body_mass_kg = body_mass_g / 1000, bill = bill_length_mm * bill_depth_mm) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 344 × 10 ## species island bill_le…¹ bill_…² flipp…³ body_…⁴ sex year body_…⁵ bill ## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> <dbl> <dbl> ## 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007 3.75 731. ## 2 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007 3.8 687. ## 3 Adelie Torgersen 40.3 18 195 3250 fema… 2007 3.25 725. 
## 4 Adelie Torgersen NA NA NA NA <NA> 2007 NA NA ## 5 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007 3.45 708. ## 6 Adelie Torgersen 39.3 20.6 190 3650 male 2007 3.65 810. ## 7 Adelie Torgersen 38.9 17.8 181 3625 fema… 2007 3.62 692. ## 8 Adelie Torgersen 39.2 19.6 195 4675 male 2007 4.68 768. ## 9 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007 3.48 617. ## 10 Adelie Torgersen 42 20.2 190 4250 <NA> 2007 4.25 848. ## # … with 334 more rows, and abbreviated variable names ¹bill_length_mm, ## # ²bill_depth_mm, ³flipper_length_mm, ⁴body_mass_g, ⁵body_mass_kg ``` ] ] --- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% mutate(body_mass_kg = body_mass_g / 1000, bill = bill_length_mm * bill_depth_mm) %>% select(body_mass_kg, bill) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 344 × 2 ## body_mass_kg bill ## <dbl> <dbl> ## 1 3.75 731. ## 2 3.8 687. ## 3 3.25 725. ## 4 NA NA ## 5 3.45 708. ## 6 3.65 810. ## 7 3.62 692. ## 8 4.68 768. ## 9 3.48 617. ## 10 4.25 848. ## # … with 334 more rows ``` ] ] --- # `arrange()` function: Changes the order of the rows. <img src="images/02-arrange.png" width="65%" style="display: block; margin: auto;" /> --- class: center, middle # How to have data arranged by the ascending order of bill length of penguins? 
--- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% * arrange(bill_length_mm) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 344 × 8 ## species island bill_length_mm bill_depth_mm flipper_…¹ body_…² sex year ## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> ## 1 Adelie Dream 32.1 15.5 188 3050 fema… 2009 ## 2 Adelie Dream 33.1 16.1 178 2900 fema… 2008 ## 3 Adelie Torgersen 33.5 19 190 3600 fema… 2008 ## 4 Adelie Dream 34 17.1 185 3400 fema… 2008 ## 5 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007 ## 6 Adelie Torgersen 34.4 18.4 184 3325 fema… 2007 ## 7 Adelie Biscoe 34.5 18.1 187 2900 fema… 2008 ## 8 Adelie Torgersen 34.6 21.1 198 4400 male 2007 ## 9 Adelie Torgersen 34.6 17.2 189 3200 fema… 2008 ## 10 Adelie Biscoe 35 17.9 190 3450 fema… 2008 ## # … with 334 more rows, and abbreviated variable names ¹flipper_length_mm, ## # ²body_mass_g ``` ] ] --- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% * arrange(desc(bill_length_mm)) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 344 × 8 ## species island bill_length_mm bill_depth_mm flipper_l…¹ body_…² sex year ## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> ## 1 Gentoo Biscoe 59.6 17 230 6050 male 2007 ## 2 Chinstrap Dream 58 17.8 181 3700 fema… 2007 ## 3 Gentoo Biscoe 55.9 17 228 5600 male 2009 ## 4 Chinstrap Dream 55.8 19.8 207 4000 male 2009 ## 5 Gentoo Biscoe 55.1 16 230 5850 male 2009 ## 6 Gentoo Biscoe 54.3 15.7 231 5650 male 2008 ## 7 Chinstrap Dream 54.2 20.8 201 4300 male 2008 ## 8 Chinstrap Dream 53.5 19.9 205 4500 male 2008 ## 9 Gentoo Biscoe 53.4 15.8 219 5500 male 2009 ## 10 Chinstrap Dream 52.8 20 205 4550 male 2008 ## # … with 334 more rows, and abbreviated variable names ¹flipper_length_mm, ## # ²body_mass_g ``` ] ] --- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% arrange(species) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 344 × 8 ## species island bill_length_mm bill_depth_mm flipper_…¹ body_…² sex year ## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int> ## 
1 Adelie Torgersen 39.1 18.7 181 3750 male 2007 ## 2 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007 ## 3 Adelie Torgersen 40.3 18 195 3250 fema… 2007 ## 4 Adelie Torgersen NA NA NA NA <NA> 2007 ## 5 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007 ## 6 Adelie Torgersen 39.3 20.6 190 3650 male 2007 ## 7 Adelie Torgersen 38.9 17.8 181 3625 fema… 2007 ## 8 Adelie Torgersen 39.2 19.6 195 4675 male 2007 ## 9 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007 ## 10 Adelie Torgersen 42 20.2 190 4250 <NA> 2007 ## # … with 334 more rows, and abbreviated variable names ¹flipper_length_mm, ## # ²body_mass_g ``` ] ] --- class: center, middle, inverse # `summarise()` function --- # `summarise()` function: Reduces multiple values down to a single summary. <img src="images/05-summarise.png" width="75%" style="display: block; margin: auto;" /> --- class: center, middle # How to find mean bill length of all penguins? --- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% * drop_na() %>% * summarise(mean_bill_length_mm = mean(bill_length_mm)) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 1 × 1 ## mean_bill_length_mm ## <dbl> ## 1 44.0 ``` ] ] --- class: center, middle # How to find species-wise mean bill length of penguins? --- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% * drop_na() %>% * group_by(species) %>% summarise(mean_bill_length_mm = mean(bill_length_mm)) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 3 × 2 ## species mean_bill_length_mm ## <fct> <dbl> ## 1 Adelie 38.8 ## 2 Chinstrap 48.8 ## 3 Gentoo 47.6 ``` ] ] --- class: center, middle # How to find species-wise mean bill length of penguins and total number of penguins in each species? 
--- .panelset[ .panel[.panel-name[Codes] ```r penguins %>% drop_na() %>% group_by(species) %>% * summarise(mean_bill_length_mm = mean(bill_length_mm), n = n()) ``` ] .panel[.panel-name[Output] ``` ## # A tibble: 3 × 3 ## species mean_bill_length_mm n ## <fct> <dbl> <int> ## 1 Adelie 38.8 146 ## 2 Chinstrap 48.8 68 ## 3 Gentoo 47.6 119 ``` ] ] --- class: center, middle, hide-count # 🙋🏽‍♀️🙋‍♂️<br>.big-text[Q&A]