library(dplyr)
library(tidyr)
# data readin
<- read.csv('../data/lab4/RY.csv')
RY <- read.csv('../data/lab4/sp_id.csv')
id <- read.csv('../data/lab4/sp_stoi_traits.csv') trait
Lab 4: Data Wrangling
1 Lab Setup
# check the data
head(RY, 3) # different species and relative yield of the species (compared to grow alone)
sp1 sp2 RY1 RY2
1 1 2 0.004167904 1.1385059
2 1 3 0.007112204 0.9607456
3 1 4 0.519281008 1.6451093
head(id, 3) # species names for each ID
sp id
1 Ankistrodesmus falcatus 1
2 Botryococcus sudeticus 2
3 Chlamydocapsa ampla 3
head(trait, 3) # ratio of a speices element (C for carbon, N for nitrogen, and P for phosphorous)
Species.name C.N C.P N.P
1 Actinastrum hantzschii 10.53468 235.5864 22.36293
2 Ankistrodesmus falcatus 18.57258 435.5213 23.44970
3 Arthrodesmus convergens 15.25011 481.3848 31.56598
2 Mutate
2.1 Exercise 1
head(RY, 3)
sp1 sp2 RY1 RY2
1 1 2 0.004167904 1.1385059
2 1 3 0.007112204 0.9607456
3 1 4 0.519281008 1.6451093
<- mutate(RY, avgRY = (RY1 + RY2) / 2)
RY head(RY, 3)
sp1 sp2 RY1 RY2 avgRY
1 1 2 0.004167904 1.1385059 0.5713369
2 1 3 0.007112204 0.9607456 0.4839289
3 1 4 0.519281008 1.6451093 1.0821952
2.1.1 Question 1.1
What did we just do using the mutate()
function? What column was added? What does the column represent?
Answer: We used the mutate()
to create a new column avgRY
, which is the mean of the RY1
and RY2
.
2.1.2 Question 1.2
Please use the mutate function to create a new column that is the sum of RY1 and RY2. Name this new column RYTotal and name this new dataframe RY.t. Please do not add this new column to the RY dataset because if you do you will run into problems later in the lab!
<- RY %>%
RY.t mutate(RYTotal=RY1+RY2)
head(RY.t)
sp1 sp2 RY1 RY2 avgRY RYTotal
1 1 2 0.004167904 1.1385059 0.5713369 1.1426738
2 1 3 0.007112204 0.9607456 0.4839289 0.9678578
3 1 4 0.519281008 1.6451093 1.0821952 2.1643903
4 1 5 0.004955600 0.9128606 0.4589081 0.9178162
5 1 7 0.449101255 1.3225352 0.8858182 1.7716365
6 1 8 0.883576608 0.1783871 0.5309819 1.0619637
3 Merge/join
3.1 Practice Work
# join the RY and the id dataframe, get the species name
colnames(id)[2] <- "sp1" # This is a very complicated way of joining
<- left_join(RY, id, by = "sp1")
RY.m colnames(RY.m)[6] <- "sp1_name"
colnames(id)[2] <- "sp2"
<- left_join(RY.m, id, by = "sp2")
RY.m colnames(RY.m)[7] <- "sp2_name"
# left join the RY.m and the trait dataframe
colnames(trait)[1] <- "sp1_name"
<- left_join(RY.m, trait, by = "sp1_name")
RY.mtl mean(RY.mtl[, "C.N"], na.rm = TRUE)
[1] 13.92414
# inner join the RY.m and the trait dataframe
colnames(trait)[1] <- "sp1_name"
<- inner_join(RY.m, trait, by = "sp1_name")
RY.mti mean(RY.mti[, "C.N"])
[1] 13.92414
3.2 Exercise 2
3.2.1 Question 2.1
Please use right_join to join RY.m and trait by sp1_name and name this dataframe RY.mtr (please add RY.m and trait in the same order as you did in the two joins above).
<- right_join(RY.m, trait, by = 'sp1_name') RY.mtr
How does this dataframe compare to the left_join dataframe RY.mtl and the inner_join dataframe RY.mti?
Answer: The right_join()
result retains all observations in y (i.e. trait
in this case). Therefore, some species only exist in trait
but not in the RY.m
remains in the result, leading to the many ‘NA’ records in the RY.mtr
. The inner_join()
only keeps the observations both in x and y, thus giving the shortest result. And the left_join()
keeps all observations in x.
tail(RY.mtr)
sp1 sp2 RY1 RY2 avgRY sp1_name sp2_name C.N C.P
207 NA NA NA NA NA Staurastrum cingulum <NA> 13.959753 NA
208 NA NA NA NA NA Staurastrum punctulatum <NA> 12.948858 NA
209 NA NA NA NA NA Staurastrum tetracerum <NA> 13.965427 258.95257
210 NA NA NA NA NA Tetraedron minutum <NA> 16.242217 474.40228
211 NA NA NA NA NA Tetrastrum?heteracantum <NA> 14.211187 293.12711
212 NA NA NA NA NA Volvox tertius <NA> 8.166389 96.29642
N.P
207 NA
208 NA
209 18.54240
210 29.20798
211 20.62650
212 11.79180
4 Gather & Spread
4.1 Practice Work
# from wide to long
<- gather(RY, key = RY_type, value = value, RY1:avgRY)
RY.long head(RY.long, 3)
sp1 sp2 RY_type value
1 1 2 RY1 0.004167904
2 1 3 RY1 0.007112204
3 1 4 RY1 0.519281008
# from long to wide
<- spread(RY.long, key = RY_type, value = value)
RY.wide head(RY.wide, 3)
sp1 sp2 avgRY RY1 RY2
1 1 2 0.5713369 0.004167904 1.1385059
2 1 3 0.4839289 0.007112204 0.9607456
3 1 4 1.0821952 0.519281008 1.6451093
5 Subset/filter
5.1 Exercise 3
5.1.1 Question 3.1
Please use the subset function to subset your data for species number 17 in the species 2 column (sp2). Name this subset sp17. Now subset this data frame, sp17, to only include the column RY2.
<- subset(RY, sp2 == 17)
sp17 head(sp17, 3)
sp1 sp2 RY1 RY2 avgRY
14 1 17 0.007949608 1.0448059 0.5263777
25 2 17 0.808653223 0.3098087 0.5592310
38 3 17 0.113828498 0.7334857 0.4236571
<- subset(sp17, select=c(RY2))
sp17 head(sp17, 3)
RY2
14 1.0448059
25 0.3098087
38 0.7334857
5.2 Exercise 4
5.2.1 Question 4.1
<- filter(RY, sp2 == 17 & sp1 == 1)
sp_test1 head(sp_test1)
sp1 sp2 RY1 RY2 avgRY
1 1 17 0.007949608 1.044806 0.5263777
<- filter(RY, sp2 == 17 | sp1 == 1)
sp_test2 head(sp_test2)
sp1 sp2 RY1 RY2 avgRY
1 1 2 0.004167904 1.1385059 0.5713369
2 1 3 0.007112204 0.9607456 0.4839289
3 1 4 0.519281008 1.6451093 1.0821952
4 1 5 0.004955600 0.9128606 0.4589081
5 1 7 0.449101255 1.3225352 0.8858182
6 1 8 0.883576608 0.1783871 0.5309819
- Please explain the difference between the above two data frames you created sp_test1 and sp_test2. What is the difference between using & versus | in the filter() or subset() functions?
Answer: sp_test1
only includes records that sp2
= 17 and sp1
= 1 at the same time; while sp_test2
contains records that either sp2
= 17 or sp1
= 1. The difference between using & v.s. | is that & needs to meet both of the conditions while | only needs to meet one of the conditions.
5.2.2 Extra Credit (1 EXTRA POINT)
Your turn! Use what we’ve learned above to generate the ‘goal’ data frame that was shown at the beginning of this lab.
# Start from the original data
<- read.csv('../data/lab4/RY.csv')
RY <- read.csv('../data/lab4/sp_id.csv')
id <- read.csv('../data/lab4/sp_stoi_traits.csv')
trait
<- inner_join(trait, id, by=join_by(Species.name==sp))
trait.id
<- RY %>%
goal mutate(avgRY = (RY1 + RY2) / 2) %>%
inner_join(trait.id, by=c('sp1'='id')) %>%
inner_join(trait.id, by=c('sp2'='id'), suffix=c('.x', '.y'))
head(goal)
sp1 sp2 RY1 RY2 avgRY Species.name.x C.N.x
1 1 2 0.004167904 1.1385059 0.5713369 Ankistrodesmus falcatus 18.57258
2 1 3 0.007112204 0.9607456 0.4839289 Ankistrodesmus falcatus 18.57258
3 1 4 0.519281008 1.6451093 1.0821952 Ankistrodesmus falcatus 18.57258
4 1 5 0.004955600 0.9128606 0.4589081 Ankistrodesmus falcatus 18.57258
5 1 7 0.449101255 1.3225352 0.8858182 Ankistrodesmus falcatus 18.57258
6 1 8 0.883576608 0.1783871 0.5309819 Ankistrodesmus falcatus 18.57258
C.P.x N.P.x Species.name.y C.N.y C.P.y N.P.y
1 435.5213 23.4497 Botryococcus sudeticus 16.437039 365.5173 22.23742
2 435.5213 23.4497 Chlamydocapsa ampla 12.359818 259.7180 21.01310
3 435.5213 23.4497 Chlamydomonas moewusii 10.639614 161.1780 15.14885
4 435.5213 23.4497 Chlorella sorokiniana 7.129078 146.6461 20.57014
5 435.5213 23.4497 Closteriopsis acicularis 18.936076 363.5510 19.19885
6 435.5213 23.4497 Coelastrum microporum 10.896064 219.1580 20.11350