Lab 4: Data Wrangling

Author

Xiuyu Cao

Published

February 14, 2024

1 Lab Setup

library(dplyr)
library(tidyr)

# data readin
RY <- read.csv('../data/lab4/RY.csv')
id <- read.csv('../data/lab4/sp_id.csv')
trait <- read.csv('../data/lab4/sp_stoi_traits.csv')
# check the data
head(RY, 3)  # different species and relative yield of the species (compared to grow alone)
  sp1 sp2         RY1       RY2
1   1   2 0.004167904 1.1385059
2   1   3 0.007112204 0.9607456
3   1   4 0.519281008 1.6451093
head(id, 3)  # species names for each ID
                       sp id
1 Ankistrodesmus falcatus  1
2  Botryococcus sudeticus  2
3     Chlamydocapsa ampla  3
head(trait, 3)  # ratio of a speices element (C for carbon, N for nitrogen, and P for phosphorous)
             Species.name      C.N      C.P      N.P
1 Actinastrum hantzschii  10.53468 235.5864 22.36293
2 Ankistrodesmus falcatus 18.57258 435.5213 23.44970
3 Arthrodesmus convergens 15.25011 481.3848 31.56598

2 Mutate

2.1 Exercise 1

head(RY, 3)
  sp1 sp2         RY1       RY2
1   1   2 0.004167904 1.1385059
2   1   3 0.007112204 0.9607456
3   1   4 0.519281008 1.6451093
RY <- mutate(RY, avgRY = (RY1 + RY2) / 2)
head(RY, 3)
  sp1 sp2         RY1       RY2     avgRY
1   1   2 0.004167904 1.1385059 0.5713369
2   1   3 0.007112204 0.9607456 0.4839289
3   1   4 0.519281008 1.6451093 1.0821952

2.1.1 Question 1.1

What did we just do using the mutate() function? What column was added? What does the column represent?

Answer: We used the mutate() to create a new column avgRY, which is the mean of the RY1 and RY2.

2.1.2 Question 1.2

Please use the mutate function to create a new column that is the sum of RY1 and RY2. Name this new column RYTotal and name this new dataframe RY.t. Please do not add this new column to the RY dataset because if you do you will run into problems later in the lab!

RY.t <- RY %>%
  mutate(RYTotal=RY1+RY2)
head(RY.t)
  sp1 sp2         RY1       RY2     avgRY   RYTotal
1   1   2 0.004167904 1.1385059 0.5713369 1.1426738
2   1   3 0.007112204 0.9607456 0.4839289 0.9678578
3   1   4 0.519281008 1.6451093 1.0821952 2.1643903
4   1   5 0.004955600 0.9128606 0.4589081 0.9178162
5   1   7 0.449101255 1.3225352 0.8858182 1.7716365
6   1   8 0.883576608 0.1783871 0.5309819 1.0619637

3 Merge/join

3.1 Practice Work

# join the RY and the id dataframe, get the species name
colnames(id)[2] <- "sp1"  # This is a very complicated way of joining
RY.m <- left_join(RY, id, by = "sp1")
colnames(RY.m)[6] <- "sp1_name"
colnames(id)[2] <- "sp2"
RY.m <- left_join(RY.m, id, by = "sp2")
colnames(RY.m)[7] <- "sp2_name"

# left join the RY.m and the trait dataframe
colnames(trait)[1] <- "sp1_name"
RY.mtl <- left_join(RY.m, trait, by = "sp1_name")
mean(RY.mtl[, "C.N"], na.rm = TRUE)
[1] 13.92414
# inner join the RY.m and the trait dataframe
colnames(trait)[1] <- "sp1_name"
RY.mti <- inner_join(RY.m, trait, by = "sp1_name")
mean(RY.mti[, "C.N"])
[1] 13.92414

3.2 Exercise 2

3.2.1 Question 2.1

Please use right_join to join RY.m and trait by sp1_name and name this dataframe RY.mtr (please add RY.m and trait in the same order as you did in the two joins above).

RY.mtr <- right_join(RY.m, trait, by = 'sp1_name')

How does this dataframe compare to the left_join dataframe RY.mtl and the inner_join dataframe RY.mti?

Answer: The right_join() result retains all observations in y (i.e. trait in this case). Therefore, some species only exist in trait but not in the RY.m remains in the result, leading to the many ‘NA’ records in the RY.mtr. The inner_join() only keeps the observations both in x and y, thus giving the shortest result. And the left_join() keeps all observations in x.

tail(RY.mtr)
    sp1 sp2 RY1 RY2 avgRY                sp1_name sp2_name       C.N       C.P
207  NA  NA  NA  NA    NA    Staurastrum cingulum     <NA> 13.959753        NA
208  NA  NA  NA  NA    NA Staurastrum punctulatum     <NA> 12.948858        NA
209  NA  NA  NA  NA    NA  Staurastrum tetracerum     <NA> 13.965427 258.95257
210  NA  NA  NA  NA    NA      Tetraedron minutum     <NA> 16.242217 474.40228
211  NA  NA  NA  NA    NA Tetrastrum?heteracantum     <NA> 14.211187 293.12711
212  NA  NA  NA  NA    NA          Volvox tertius     <NA>  8.166389  96.29642
         N.P
207       NA
208       NA
209 18.54240
210 29.20798
211 20.62650
212 11.79180

4 Gather & Spread

4.1 Practice Work

# from wide to long
RY.long <- gather(RY, key = RY_type, value = value, RY1:avgRY)
head(RY.long, 3)
  sp1 sp2 RY_type       value
1   1   2     RY1 0.004167904
2   1   3     RY1 0.007112204
3   1   4     RY1 0.519281008
# from long to wide
RY.wide <- spread(RY.long, key = RY_type, value = value)
head(RY.wide, 3)
  sp1 sp2     avgRY         RY1       RY2
1   1   2 0.5713369 0.004167904 1.1385059
2   1   3 0.4839289 0.007112204 0.9607456
3   1   4 1.0821952 0.519281008 1.6451093

5 Subset/filter

5.1 Exercise 3

5.1.1 Question 3.1

Please use the subset function to subset your data for species number 17 in the species 2 column (sp2). Name this subset sp17. Now subset this data frame, sp17, to only include the column RY2.

sp17 <- subset(RY, sp2 == 17)
head(sp17, 3)
   sp1 sp2         RY1       RY2     avgRY
14   1  17 0.007949608 1.0448059 0.5263777
25   2  17 0.808653223 0.3098087 0.5592310
38   3  17 0.113828498 0.7334857 0.4236571
sp17 <- subset(sp17, select=c(RY2))
head(sp17, 3)
         RY2
14 1.0448059
25 0.3098087
38 0.7334857

5.2 Exercise 4

5.2.1 Question 4.1

sp_test1 <- filter(RY, sp2 == 17 & sp1 == 1)
head(sp_test1)
  sp1 sp2         RY1      RY2     avgRY
1   1  17 0.007949608 1.044806 0.5263777
sp_test2 <- filter(RY, sp2 == 17 | sp1 == 1)
head(sp_test2)
  sp1 sp2         RY1       RY2     avgRY
1   1   2 0.004167904 1.1385059 0.5713369
2   1   3 0.007112204 0.9607456 0.4839289
3   1   4 0.519281008 1.6451093 1.0821952
4   1   5 0.004955600 0.9128606 0.4589081
5   1   7 0.449101255 1.3225352 0.8858182
6   1   8 0.883576608 0.1783871 0.5309819
  1. Please explain the difference between the above two data frames you created sp_test1 and sp_test2. What is the difference between using & versus | in the filter() or subset() functions?

Answer: sp_test1 only includes records that sp2 = 17 and sp1 = 1 at the same time; while sp_test2 contains records that either sp2 = 17 or sp1 = 1. The difference between using & v.s. | is that & needs to meet both of the conditions while | only needs to meet one of the conditions.

5.2.2 Extra Credit (1 EXTRA POINT)

Your turn! Use what we’ve learned above to generate the ‘goal’ data frame that was shown at the beginning of this lab.

# Start from the original data
RY <- read.csv('../data/lab4/RY.csv')
id <- read.csv('../data/lab4/sp_id.csv')
trait <- read.csv('../data/lab4/sp_stoi_traits.csv')

trait.id <- inner_join(trait, id, by=join_by(Species.name==sp))

goal <- RY %>%
  mutate(avgRY = (RY1 + RY2) / 2) %>%
  inner_join(trait.id, by=c('sp1'='id')) %>%
  inner_join(trait.id, by=c('sp2'='id'), suffix=c('.x', '.y'))
head(goal)
  sp1 sp2         RY1       RY2     avgRY          Species.name.x    C.N.x
1   1   2 0.004167904 1.1385059 0.5713369 Ankistrodesmus falcatus 18.57258
2   1   3 0.007112204 0.9607456 0.4839289 Ankistrodesmus falcatus 18.57258
3   1   4 0.519281008 1.6451093 1.0821952 Ankistrodesmus falcatus 18.57258
4   1   5 0.004955600 0.9128606 0.4589081 Ankistrodesmus falcatus 18.57258
5   1   7 0.449101255 1.3225352 0.8858182 Ankistrodesmus falcatus 18.57258
6   1   8 0.883576608 0.1783871 0.5309819 Ankistrodesmus falcatus 18.57258
     C.P.x   N.P.x           Species.name.y     C.N.y    C.P.y    N.P.y
1 435.5213 23.4497   Botryococcus sudeticus 16.437039 365.5173 22.23742
2 435.5213 23.4497      Chlamydocapsa ampla 12.359818 259.7180 21.01310
3 435.5213 23.4497   Chlamydomonas moewusii 10.639614 161.1780 15.14885
4 435.5213 23.4497    Chlorella sorokiniana  7.129078 146.6461 20.57014
5 435.5213 23.4497 Closteriopsis acicularis 18.936076 363.5510 19.19885
6 435.5213 23.4497    Coelastrum microporum 10.896064 219.1580 20.11350