Data manipulation using dplyr

Author: Team BioSakshat

Last update: June 2017

Lets begin

Load dplyr library

library(dplyr);

## Warning: package 'dplyr' was built under R version 3.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load Gene Expression Data

gen=read.table("_site/data/Day2/GenExp_data.txt", header = TRUE, stringsAsFactors = FALSE);
head(gen);

##    Gene    D1    D2    D3    D4    D5 Path Genetype
## 1 Gene1  0.25  0.90 -0.59 -0.48  0.10   P1   Coding
## 2 Gene2 -0.12  1.66 -1.74 -0.43 -1.31   P1   Coding
## 3 Gene3 -1.34  0.04 -0.86 -0.59 -0.59   P1   Coding
## 4 Gene4  0.43 -1.69 -1.88 -0.39  0.10   P1   Coding
## 5 Gene5  1.04  1.04 -2.00 -2.44  1.07   P1   Coding
## 6 Gene6 -1.83  1.37 -0.01 -0.99  0.04   P1   Coding

dim(gen);

## [1] 200   8

Filter rows with filter()

filter() allows you to select a subset of rows in a data frame.

Filter coding genes.

temp=filter(gen, Genetype=="Coding");

## Warning: package 'bindrcpp' was built under R version 3.3.3

head(temp);

##    Gene    D1    D2    D3    D4    D5 Path Genetype
## 1 Gene1  0.25  0.90 -0.59 -0.48  0.10   P1   Coding
## 2 Gene2 -0.12  1.66 -1.74 -0.43 -1.31   P1   Coding
## 3 Gene3 -1.34  0.04 -0.86 -0.59 -0.59   P1   Coding
## 4 Gene4  0.43 -1.69 -1.88 -0.39  0.10   P1   Coding
## 5 Gene5  1.04  1.04 -2.00 -2.44  1.07   P1   Coding
## 6 Gene6 -1.83  1.37 -0.01 -0.99  0.04   P1   Coding

dim(temp);

## [1] 100   8

Q1) Filter only those genes which are in pathway P2. How many such genes are there

Filter those genes whose gene expression values on Day1 (D1) > 1.5 (up regulated).

filter(gen, D1> 1.5);

##       Gene   D1    D2    D3    D4    D5 Path  Genetype
## 1   Gene18 1.78 -0.07  1.28  0.69 -0.41   P1    Coding
## 2   Gene26 2.12  0.14  0.54 -0.43  1.88   P1    Coding
## 3   Gene30 1.69 -0.26  0.05 -0.29  0.80   P1    Coding
## 4   Gene36 1.83 -0.28 -1.08 -1.08  0.43   P1 Noncoding
## 5   Gene49 1.90  0.67  0.20  0.07  1.17   P1 Noncoding
## 6   Gene66 1.82 -0.08  0.61  1.99  0.17   P1 Noncoding
## 7  Gene106 3.16  0.77 -0.11  0.67 -0.37   P2    Coding
## 8  Gene111 1.99 -0.47  0.36  0.19 -0.83   P2    Coding
## 9  Gene134 1.99 -0.83  0.08 -0.31  1.92   P3    Coding
## 10 Gene135 1.66 -1.58  0.43  0.22 -0.91   P3    Coding
## 11 Gene145 2.24 -1.69  0.70 -0.94  1.63   P3    Coding
## 12 Gene173 1.63 -0.89  0.59  2.56 -0.41   P4    Coding
## 13 Gene185 1.75 -0.20 -0.96 -0.40 -0.40   P4 Noncoding
## 14 Gene199 1.76  1.02  0.06 -0.79  0.69   P4 Noncoding

Filter those genes whose gene expression values on Day1 (D1) > 1.5 and Day2 (D2) > 1.

filter(gen, D1> 1.5, D2>1);

##      Gene   D1   D2   D3    D4   D5 Path  Genetype
## 1 Gene199 1.76 1.02 0.06 -0.79 0.69   P4 Noncoding

Q2) Filter those genes whose expression on Day1 > 1.5 and belong to Pathway P2.

Filter genes whose D1 or D2 expression is > 1.5

# Using OR condition: |
temp=filter(gen, D1 > 1.5 | D2 > 1.5);
head(temp);

##     Gene    D1    D2    D3    D4    D5 Path  Genetype
## 1  Gene2 -0.12  1.66 -1.74 -0.43 -1.31   P1    Coding
## 2 Gene18  1.78 -0.07  1.28  0.69 -0.41   P1    Coding
## 3 Gene25 -2.09  2.12  0.11 -0.72 -0.17   P1    Coding
## 4 Gene26  2.12  0.14  0.54 -0.43  1.88   P1    Coding
## 5 Gene30  1.69 -0.26  0.05 -0.29  0.80   P1    Coding
## 6 Gene36  1.83 -0.28 -1.08 -1.08  0.43   P1 Noncoding

dim(temp);

## [1] 27  8

Filter genes from pathway P2 whose D1 or D2 expression is > 1.5

filter(gen, D1 > 1.5 | D2 > 1.5, Path=="P2")

##      Gene    D1    D2    D3    D4    D5 Path  Genetype
## 1  Gene75  0.05  2.33  1.25  0.79  1.75   P2 Noncoding
## 2  Gene81 -1.90  1.57 -1.05 -0.63 -0.26   P2 Noncoding
## 3 Gene106  3.16  0.77 -0.11  0.67 -0.37   P2    Coding
## 4 Gene109  0.55  1.65 -0.49  0.56 -0.66   P2    Coding
## 5 Gene111  1.99 -0.47  0.36  0.19 -0.83   P2    Coding

Piping using %>%

Piping minimizes the use of variables. It cleans the code. Results from left hand site is piped to right hand side.

How many genes are there in Pathway P2

filter(gen, Path=="P2") %>% nrow

## [1] 60

Filter those genes which below to pathway P2.
For the filtered genes, filter further whose D1 expression are > 1

filter(gen, Path=="P2") %>% filter(., D1 > 1) %>% nrow

## [1] 6

# . is optional if used as 1st argument
filter(gen, Path=="P2") %>% filter(D1 > 1) %>% nrow

## [1] 6

Arrange rows with arrange()

# Arrange genes first as per Pathway followed by expression values on D1
arrange(gen, Path, D1) %>% head

##     Gene    D1    D2    D3    D4    D5 Path  Genetype
## 1 Gene25 -2.09  2.12  0.11 -0.72 -0.17   P1    Coding
## 2  Gene6 -1.83  1.37 -0.01 -0.99  0.04   P1    Coding
## 3 Gene35 -1.62  0.44  2.04  3.35 -0.58   P1 Noncoding
## 4 Gene20 -1.57 -1.12  1.08  0.60 -0.64   P1    Coding
## 5  Gene3 -1.34  0.04 -0.86 -0.59 -0.59   P1    Coding
## 6 Gene68 -1.32  0.37  1.50 -0.08  1.04   P1 Noncoding

# Arrange genes first as per Pathway followed by expression values on D1 in descending order
arrange(gen, Path, desc(D1)) %>% head

##     Gene   D1    D2    D3    D4    D5 Path  Genetype
## 1 Gene26 2.12  0.14  0.54 -0.43  1.88   P1    Coding
## 2 Gene49 1.90  0.67  0.20  0.07  1.17   P1 Noncoding
## 3 Gene36 1.83 -0.28 -1.08 -1.08  0.43   P1 Noncoding
## 4 Gene66 1.82 -0.08  0.61  1.99  0.17   P1 Noncoding
## 5 Gene18 1.78 -0.07  1.28  0.69 -0.41   P1    Coding
## 6 Gene30 1.69 -0.26  0.05 -0.29  0.80   P1    Coding

Select rows position wise using slice()

# Select 25th to 30th row
slice(gen, 25:30);

## # A tibble: 6 x 8
##     Gene    D1    D2    D3    D4    D5  Path Genetype
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>    <chr>
## 1 Gene25 -2.09  2.12  0.11 -0.72 -0.17    P1   Coding
## 2 Gene26  2.12  0.14  0.54 -0.43  1.88    P1   Coding
## 3 Gene27  0.09 -0.78 -0.72 -0.21  0.47    P1   Coding
## 4 Gene28 -0.49 -0.92  1.56  0.59 -1.37    P1   Coding
## 5 Gene29  1.01 -0.72  0.07  0.32  0.99    P1   Coding
## 6 Gene30  1.69 -0.26  0.05 -0.29  0.80    P1   Coding

Select column using select()

# Select D1, D5 and Path column
select(gen, D1, D5, Path) %>% head;

##      D1    D5 Path
## 1  0.25  0.10   P1
## 2 -0.12 -1.31   P1
## 3 -1.34 -0.59   P1
## 4  0.43  0.10   P1
## 5  1.04  1.07   P1
## 6 -1.83  0.04   P1

# Exclude D2 column
select(gen, -D2) %>% head;

##    Gene    D1    D3    D4    D5 Path Genetype
## 1 Gene1  0.25 -0.59 -0.48  0.10   P1   Coding
## 2 Gene2 -0.12 -1.74 -0.43 -1.31   P1   Coding
## 3 Gene3 -1.34 -0.86 -0.59 -0.59   P1   Coding
## 4 Gene4  0.43 -1.88 -0.39  0.10   P1   Coding
## 5 Gene5  1.04 -2.00 -2.44  1.07   P1   Coding
## 6 Gene6 -1.83 -0.01 -0.99  0.04   P1   Coding

# Exclude D2, D5 column
select(gen, -c(D2,D5)) %>% head;

##    Gene    D1    D3    D4 Path Genetype
## 1 Gene1  0.25 -0.59 -0.48   P1   Coding
## 2 Gene2 -0.12 -1.74 -0.43   P1   Coding
## 3 Gene3 -1.34 -0.86 -0.59   P1   Coding
## 4 Gene4  0.43 -1.88 -0.39   P1   Coding
## 5 Gene5  1.04 -2.00 -2.44   P1   Coding
## 6 Gene6 -1.83 -0.01 -0.99   P1   Coding

# Column name starts with
select(gen, starts_with("D")) %>% head;

##      D1    D2    D3    D4    D5
## 1  0.25  0.90 -0.59 -0.48  0.10
## 2 -0.12  1.66 -1.74 -0.43 -1.31
## 3 -1.34  0.04 -0.86 -0.59 -0.59
## 4  0.43 -1.69 -1.88 -0.39  0.10
## 5  1.04  1.04 -2.00 -2.44  1.07
## 6 -1.83  1.37 -0.01 -0.99  0.04

# Column name ends with 3
select(gen, ends_with("3")) %>% head;

##      D3
## 1 -0.59
## 2 -1.74
## 3 -0.86
## 4 -1.88
## 5 -2.00
## 6 -0.01

# Column name contains "e"
select(gen, contains("e")) %>% head;

##    Gene Genetype
## 1 Gene1   Coding
## 2 Gene2   Coding
## 3 Gene3   Coding
## 4 Gene4   Coding
## 5 Gene5   Coding
## 6 Gene6   Coding

# Select all column from D1 to Path
select(gen, D1:Path) %>% head;

##      D1    D2    D3    D4    D5 Path
## 1  0.25  0.90 -0.59 -0.48  0.10   P1
## 2 -0.12  1.66 -1.74 -0.43 -1.31   P1
## 3 -1.34  0.04 -0.86 -0.59 -0.59   P1
## 4  0.43 -1.69 -1.88 -0.39  0.10   P1
## 5  1.04  1.04 -2.00 -2.44  1.07   P1
## 6 -1.83  1.37 -0.01 -0.99  0.04   P1

# Select 2nd and 4th column
select(gen, c(2,4)) %>% head

##      D1    D3
## 1  0.25 -0.59
## 2 -0.12 -1.74
## 3 -1.34 -0.86
## 4  0.43 -1.88
## 5  1.04 -2.00
## 6 -1.83 -0.01

Extract unique rows using distinct()

# Select unique entries in D1 column
select(gen, D1) %>% distinct %>% nrow

## [1] 154

# Select D2 and Path column then find unique rows
select(gen, D2, Path) %>% distinct %>% nrow

## [1] 191

Add new columns with mutate() and transmute() function

# mutate(gen,difD1D2=D1-D2): Adds a column named as “difD1D2”, which consists of difference between values of D1 and D2 gene expressions, to the existing dataset.
mutate(gen, diffD1D2 = D1-D2) %>% head

##    Gene    D1    D2    D3    D4    D5 Path Genetype diffD1D2
## 1 Gene1  0.25  0.90 -0.59 -0.48  0.10   P1   Coding    -0.65
## 2 Gene2 -0.12  1.66 -1.74 -0.43 -1.31   P1   Coding    -1.78
## 3 Gene3 -1.34  0.04 -0.86 -0.59 -0.59   P1   Coding    -1.38
## 4 Gene4  0.43 -1.69 -1.88 -0.39  0.10   P1   Coding     2.12
## 5 Gene5  1.04  1.04 -2.00 -2.44  1.07   P1   Coding     0.00
## 6 Gene6 -1.83  1.37 -0.01 -0.99  0.04   P1   Coding    -3.20

# mutate(gen,difD1D2=D1-D2,pdifD1D2=difD1D2*100) Use newly created column difD1D2 to create another pdifD1D2. So, you can create and reuse newly created columns.
mutate(gen, diffD1D2 = D1-D2, pdiffD1D2=diffD1D2*100) %>% head

##    Gene    D1    D2    D3    D4    D5 Path Genetype diffD1D2 pdiffD1D2
## 1 Gene1  0.25  0.90 -0.59 -0.48  0.10   P1   Coding    -0.65       -65
## 2 Gene2 -0.12  1.66 -1.74 -0.43 -1.31   P1   Coding    -1.78      -178
## 3 Gene3 -1.34  0.04 -0.86 -0.59 -0.59   P1   Coding    -1.38      -138
## 4 Gene4  0.43 -1.69 -1.88 -0.39  0.10   P1   Coding     2.12       212
## 5 Gene5  1.04  1.04 -2.00 -2.44  1.07   P1   Coding     0.00         0
## 6 Gene6 -1.83  1.37 -0.01 -0.99  0.04   P1   Coding    -3.20      -320

mutate(gen, diffD1D2 = D1-D2, pdiffD1D2=diffD1D2*100, meanexp=rowMeans(select(gen,D1,D2,D3,D4,D5))) %>% head

##    Gene    D1    D2    D3    D4    D5 Path Genetype diffD1D2 pdiffD1D2
## 1 Gene1  0.25  0.90 -0.59 -0.48  0.10   P1   Coding    -0.65       -65
## 2 Gene2 -0.12  1.66 -1.74 -0.43 -1.31   P1   Coding    -1.78      -178
## 3 Gene3 -1.34  0.04 -0.86 -0.59 -0.59   P1   Coding    -1.38      -138
## 4 Gene4  0.43 -1.69 -1.88 -0.39  0.10   P1   Coding     2.12       212
## 5 Gene5  1.04  1.04 -2.00 -2.44  1.07   P1   Coding     0.00         0
## 6 Gene6 -1.83  1.37 -0.01 -0.99  0.04   P1   Coding    -3.20      -320
##   meanexp
## 1   0.036
## 2  -0.388
## 3  -0.668
## 4  -0.686
## 5  -0.258
## 6  -0.284

# Just keeps the new columns only.
transmute(gen, difD1D2=D1-D2, pdifD1D2 = difD1D2 * 100) %>% head

##   difD1D2 pdifD1D2
## 1   -0.65      -65
## 2   -1.78     -178
## 3   -1.38     -138
## 4    2.12      212
## 5    0.00        0
## 6   -3.20     -320

Summarise values with summarise()

summarise(gen, D1mean = mean(D1), D2mean = mean(D2))

##   D1mean  D2mean
## 1  0.041 0.00235

Grouped operations

# Group genes as per pathway
gengrp=group_by(gen, Path);
# For every group then do some operation
summarise(gengrp, n())

## # A tibble: 4 x 2
##    Path `n()`
##   <chr> <int>
## 1    P1    70
## 2    P2    60
## 3    P3    40
## 4    P4    30

summarise(gengrp, mean(D1), mean(D2), mean(D3))

## # A tibble: 4 x 4
##    Path  `mean(D1)` `mean(D2)` `mean(D3)`
##   <chr>       <dbl>      <dbl>      <dbl>
## 1    P1  0.04528571    -0.0510 -0.1077143
## 2    P2 -0.04500000     0.0115 -0.1471667
## 3    P3  0.15225000    -0.1180 -0.1107500
## 4    P4  0.05466667     0.2690 -0.2690000

summarise(gengrp, mD1=mean(D1), mD2=mean(D2), mD3=mean(D3)) %>% filter(.,mD1>0.05) %>% select(.,Path,mD1);

## # A tibble: 2 x 2
##    Path        mD1
##   <chr>      <dbl>
## 1    P3 0.15225000
## 2    P4 0.05466667

summarise(gengrp, OverExp_Genecount_D1=sum(D1>1.5), OverExp_Genecount_D2=sum(D2>1.5), OverExp_Genecount_D3=sum(D3>1.5))

## # A tibble: 4 x 4
##    Path OverExp_Genecount_D1 OverExp_Genecount_D2 OverExp_Genecount_D3
##   <chr>                <int>                <int>                <int>
## 1    P1                    6                    3                    7
## 2    P2                    2                    3                    4
## 3    P3                    3                    2                    0
## 4    P4                    3                    5                    1

Joins two table

Currently dplyr supports following join types:

inner_join()
left_join()
right_join()
semi_join()
anti_join()
full_join()

Create Pathway information data frame which we will use to demonstrate join

# Select one gene from each pathway, for demo purpose
tempgen = group_by(gen,Path) %>% slice(1);
# Create Pathway information. Note that we dont have pathway information for P4
Pathwayinfo=data.frame(Path=c("P1","P2","P3","P5"), Name=c("Glycolysis","Gluconeogenesis","Purine metabolism","Apotopsis"), ID=c("ID0752","ID3251","ID2567","ID4568"));
Pathwayinfo;

##   Path              Name     ID
## 1   P1        Glycolysis ID0752
## 2   P2   Gluconeogenesis ID3251
## 3   P3 Purine metabolism ID2567
## 4   P5         Apotopsis ID4568

tempgen;

## # A tibble: 4 x 8
## # Groups:   Path [4]
##      Gene    D1    D2    D3    D4    D5  Path  Genetype
##     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>     <chr>
## 1   Gene1  0.25  0.90 -0.59 -0.48  0.10    P1    Coding
## 2  Gene71 -1.05  0.84 -1.43  0.65 -1.05    P2 Noncoding
## 3 Gene131  0.59  0.00  0.51  0.20  0.33    P3    Coding
## 4 Gene171  1.44 -0.31 -0.56 -1.09 -0.76    P4    Coding

Pathwayinfo;

##   Path              Name     ID
## 1   P1        Glycolysis ID0752
## 2   P2   Gluconeogenesis ID3251
## 3   P3 Purine metabolism ID2567
## 4   P5         Apotopsis ID4568

# Return all rows from x where there are matching values in y (i.e. matching x rows)
# Return all columns from x and y. 
# If there are multiple matches between x and y, all combination of the matches are returned.
inner_join(tempgen, Pathwayinfo, by="Path");

## Warning: Column `Path` joining character vector and factor, coercing into
## character vector

## # A tibble: 3 x 10
## # Groups:   Path [?]
##      Gene    D1    D2    D3    D4    D5  Path  Genetype              Name
##     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>     <chr>            <fctr>
## 1   Gene1  0.25  0.90 -0.59 -0.48  0.10    P1    Coding        Glycolysis
## 2  Gene71 -1.05  0.84 -1.43  0.65 -1.05    P2 Noncoding   Gluconeogenesis
## 3 Gene131  0.59  0.00  0.51  0.20  0.33    P3    Coding Purine metabolism
## # ... with 1 more variables: ID <fctr>

# Return all rows from x
# All columns from x and y. 
# Rows in x with no match in y will have NA values in the new columns. If there are multiple matches between x and y, all combinations of the matches are returned.
left_join(tempgen, Pathwayinfo, by="Path");

## Warning: Column `Path` joining character vector and factor, coercing into
## character vector

## # A tibble: 4 x 10
## # Groups:   Path [?]
##      Gene    D1    D2    D3    D4    D5  Path  Genetype              Name
##     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>     <chr>            <fctr>
## 1   Gene1  0.25  0.90 -0.59 -0.48  0.10    P1    Coding        Glycolysis
## 2  Gene71 -1.05  0.84 -1.43  0.65 -1.05    P2 Noncoding   Gluconeogenesis
## 3 Gene131  0.59  0.00  0.51  0.20  0.33    P3    Coding Purine metabolism
## 4 Gene171  1.44 -0.31 -0.56 -1.09 -0.76    P4    Coding              <NA>
## # ... with 1 more variables: ID <fctr>

# Return all rows from y
# all columns from x and y. 
# Rows in y with no match in x will have NA values in the new columns. If there are multiple matches between x and y, all combinations of the matches are returned.
right_join(tempgen, Pathwayinfo, by="Path");

## Warning: Column `Path` joining character vector and factor, coercing into
## character vector

## # A tibble: 4 x 10
## # Groups:   Path [?]
##      Gene    D1    D2    D3    D4    D5  Path  Genetype              Name
##     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>     <chr>            <fctr>
## 1   Gene1  0.25  0.90 -0.59 -0.48  0.10    P1    Coding        Glycolysis
## 2  Gene71 -1.05  0.84 -1.43  0.65 -1.05    P2 Noncoding   Gluconeogenesis
## 3 Gene131  0.59  0.00  0.51  0.20  0.33    P3    Coding Purine metabolism
## 4    <NA>    NA    NA    NA    NA    NA    P5      <NA>         Apotopsis
## # ... with 1 more variables: ID <fctr>

# Return all rows and all columns from both x and y. 
# Where there are not matching values, returns NA for the one missing.
full_join(tempgen, Pathwayinfo, by="Path");

## Warning: Column `Path` joining character vector and factor, coercing into
## character vector

## # A tibble: 5 x 10
## # Groups:   Path [?]
##      Gene    D1    D2    D3    D4    D5  Path  Genetype              Name
##     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>     <chr>            <fctr>
## 1   Gene1  0.25  0.90 -0.59 -0.48  0.10    P1    Coding        Glycolysis
## 2  Gene71 -1.05  0.84 -1.43  0.65 -1.05    P2 Noncoding   Gluconeogenesis
## 3 Gene131  0.59  0.00  0.51  0.20  0.33    P3    Coding Purine metabolism
## 4 Gene171  1.44 -0.31 -0.56 -1.09 -0.76    P4    Coding              <NA>
## 5    <NA>    NA    NA    NA    NA    NA    P5      <NA>         Apotopsis
## # ... with 1 more variables: ID <fctr>

# Return all rows from x where there are matching values in y, keeping just columns from x.
semi_join(tempgen, Pathwayinfo, by="Path");

## Warning: Column `Path` joining character vector and factor, coercing into
## character vector

## # A tibble: 3 x 8
## # Groups:   Path [?]
##      Gene    D1    D2    D3    D4    D5  Path  Genetype
##     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>     <chr>
## 1   Gene1  0.25  0.90 -0.59 -0.48  0.10    P1    Coding
## 2  Gene71 -1.05  0.84 -1.43  0.65 -1.05    P2 Noncoding
## 3 Gene131  0.59  0.00  0.51  0.20  0.33    P3    Coding

# Return all rows from x where there are not matching values in y, keeping just columns from x.
anti_join(tempgen, Pathwayinfo, by="Path");

## Warning: Column `Path` joining character vector and factor, coercing into
## character vector

## # A tibble: 1 x 8
## # Groups:   Path [?]
##      Gene    D1    D2    D3    D4    D5  Path Genetype
##     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>    <chr>
## 1 Gene171  1.44 -0.31 -0.56 -1.09 -0.76    P4   Coding

References

https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html