Associated Material
Module: Module 05 - Transforming data
Readings
as.numeric
janitor package to create cleaned dataframe column names
Tidy data principles:
pivot_longer from the tidyr package
pivot_wider from the tidyr package
How to combine datasets together
rbind
cbind
cbind and maintaining data integrity
Adding additional columns onto data sets by using a common identifier (key) between datasets to maintain data relationships
inner_join from dplyr will join datasets on a key and at the same time filter so that only rows where there is a matching key will be kept
left_join and right_join from dplyr will join datasets on a key and keep all rows from the specified ‘side’ of the join, filling in the non-matching entries with NA
full_join from dplyr will join datasets on a key and keep all rows from both datasets, filling in columns of non-matching entries with NA
Using flights and planes from the nycflights13 package, how many flights were flown in April 2013 by aircraft with each engine type (e.g. Turbo-fan, Turbo-jet, etc.)?
(facet_grid)
library(tidyverse)
# Tidy the rodent survey data read from data/rodents_untidy.csv.
# Each lettered step below rewrites `rodents` in place; the final result is
# stored in `rodents_clean` and printed.
rodents <- read_csv("data/rodents_untidy.csv")
#> Rows: 41 Columns: 6
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (6): Plot location, Date collected, Family, Genus, Species, Weight
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# a) Rename the columns using janitor's "small_camel" case.
rodents <- janitor::clean_names(rodents, case = "small_camel")
# b) Turn the "-999" and "?" placeholders into NA, then convert weight to
#    numeric.
rodents <- rodents %>%
  mutate(
    weight = as.numeric(replace(weight, weight %in% c("-999", "?"), NA))
  )
# c) Split plotLocation on "_" into separate plot and location columns.
rodents <- separate(
  rodents,
  plotLocation,
  into = c("plot", "location"),
  sep = "_"
)
#> Warning: Expected 2 pieces. Missing pieces filled with `NA` in 7 rows [35, 36,
#> 37, 38, 39, 40, 41].
# d) Store the plot identifier as a number.
rodents <- rodents %>%
  mutate(plot = as.numeric(plot))
# e) Plots 1 and 2 are kept as they are. For plot 3 the existing genus column
#    is dropped and genus/species are re-derived by splitting the species
#    column on a space. The two pieces are then stacked back together.
plot1_2 <- filter(rodents, plot %in% c(1, 2))
plot_3 <- rodents %>%
  filter(plot == 3) %>%
  select(-genus) %>%
  separate(species, into = c("genus", "species"), sep = " ")
rodents_clean <- rbind(plot1_2, plot_3)
rodents_clean
#> # A tibble: 41 × 7
#> plot location dateCollected family genus species weight
#> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
#> 1 1 slope 01/09/14 Heteromyidae Dipodomys merriami 40
#> 2 1 slope 01/09/14 Heteromyidae Dipodomys merriami 36
#> 3 1 slope 01/09/14 Heteromyidae Dipodomys spectabilis 135
#> 4 1 rocks 01/09/14 Heteromyidae Dipodomys merriami 39
#> 5 1 grass 01/20/14 Heteromyidae Dipodomys merriami 43
#> 6 1 rocks 01/20/14 Heteromyidae Dipodomys spectabilis 144
#> 7 1 rocks 03/13/14 Heteromyidae Dipodomys merriami 51
#> 8 1 fence 03/13/14 Heteromyidae Dipodomys merriami 44
#> 9 1 fence 03/13/14 Heteromyidae Dipodomys spectabilis 146
#> 10 2 rocks 01/08/14 Cricetidae Neotoma albigula NA
#> # … with 31 more rows
library(tidyverse)
library(nycflights13)
# Count the flights flown in April 2013 for each aircraft engine type.
# NOTE(review): this filter previously used `month == 3` (March) although the
# exercise asks about April. Any printed counts rendered from the old filter
# are stale and need to be regenerated.
flights %>%
  # Keep April 2013 flights that have a recorded arrival time.
  filter(year == 2013, month == 4, !is.na(arr_time)) %>%
  # inner_join keeps only flights whose tailnum has a match in `planes`.
  inner_join(planes, by = "tailnum") %>%
  # One row per engine type, with the number of matching flights.
  count(engine, name = "n_flights")
#> # A tibble: 6 × 2
#> engine n_flights
#> <chr> <int>
#> 1 4 Cycle 3
#> 2 Reciprocating 165
#> 3 Turbo-fan 20114
#> 4 Turbo-jet 3391
#> 5 Turbo-prop 9
#> 6 Turbo-shaft 48
library(tidyverse)
# Read the wide population table and coerce every column except `country` to
# numeric. The original call was `across(-country, )` with no function, which
# leaves the columns untouched and is deprecated usage in dplyr >= 1.1.
# NOTE(review): `as.numeric` is the presumed intended conversion - confirm.
gapminder_yearly_pop <-
  read_csv("data/gapminder_yearly_population_millions_total.csv") %>%
  mutate(across(-country, as.numeric))
# Reshape to long form: one row per country and former column name. The column
# names go into `year` (converted to numeric) and the cell values into
# `population_millions`.
gapminder_yearly_pop_long <- gapminder_yearly_pop %>%
  pivot_longer(
    -country,
    names_to = "year",
    values_to = "population_millions"
  ) %>%
  mutate(year = as.numeric(year))
# a) New Zealand rows with year between 1900 and 2000.
nz_1900_2000 <- filter(
  gapminder_yearly_pop_long,
  country == "New Zealand",
  between(year, 1900, 2000)
)
# b) Jamaica rows for the same range of years.
jamaica_1900_2000 <- filter(
  gapminder_yearly_pop_long,
  country == "Jamaica",
  between(year, 1900, 2000)
)
# a) One line chart per country: population (millions) against year.
ggplot(nz_1900_2000, aes(year, population_millions)) +
  geom_line() +
  labs(title = "NZ population 1900-2000")
ggplot(jamaica_1900_2000, aes(year, population_millions)) +
  geom_line() +
  labs(title = "Jamaica population 1900-2000")
# b) Stack both subsets and draw one panel per country, arranged in rows.
ggplot(
  rbind(nz_1900_2000, jamaica_1900_2000),
  aes(year, population_millions)
) +
  geom_line() +
  facet_grid(rows = "country") +
  labs(title = " NZ vs Jamaica population 1900-2000")