Author: Team BioSakshat
Last update: June 2017
Copyright © 2017 BioSakshat, Inc. All rights reserved.
library(dplyr);
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
gen=read.table("_site/data/Day2/GenExp_data.txt", header = TRUE, stringsAsFactors = FALSE);
head(gen);
## Gene D1 D2 D3 D4 D5 Path Genetype
## 1 Gene1 0.25 0.90 -0.59 -0.48 0.10 P1 Coding
## 2 Gene2 -0.12 1.66 -1.74 -0.43 -1.31 P1 Coding
## 3 Gene3 -1.34 0.04 -0.86 -0.59 -0.59 P1 Coding
## 4 Gene4 0.43 -1.69 -1.88 -0.39 0.10 P1 Coding
## 5 Gene5 1.04 1.04 -2.00 -2.44 1.07 P1 Coding
## 6 Gene6 -1.83 1.37 -0.01 -0.99 0.04 P1 Coding
dim(gen);
## [1] 200 8
filter() allows you to select a subset of rows in a data frame.
Filter coding genes.
temp=filter(gen, Genetype=="Coding");
## Warning: package 'bindrcpp' was built under R version 3.3.3
head(temp);
## Gene D1 D2 D3 D4 D5 Path Genetype
## 1 Gene1 0.25 0.90 -0.59 -0.48 0.10 P1 Coding
## 2 Gene2 -0.12 1.66 -1.74 -0.43 -1.31 P1 Coding
## 3 Gene3 -1.34 0.04 -0.86 -0.59 -0.59 P1 Coding
## 4 Gene4 0.43 -1.69 -1.88 -0.39 0.10 P1 Coding
## 5 Gene5 1.04 1.04 -2.00 -2.44 1.07 P1 Coding
## 6 Gene6 -1.83 1.37 -0.01 -0.99 0.04 P1 Coding
dim(temp);
## [1] 100 8
Q1) Filter only those genes which are in pathway P2. How many such genes are there
Filter those genes whose gene expression values on Day1 (D1) > 1.5 (up regulated).
filter(gen, D1> 1.5);
## Gene D1 D2 D3 D4 D5 Path Genetype
## 1 Gene18 1.78 -0.07 1.28 0.69 -0.41 P1 Coding
## 2 Gene26 2.12 0.14 0.54 -0.43 1.88 P1 Coding
## 3 Gene30 1.69 -0.26 0.05 -0.29 0.80 P1 Coding
## 4 Gene36 1.83 -0.28 -1.08 -1.08 0.43 P1 Noncoding
## 5 Gene49 1.90 0.67 0.20 0.07 1.17 P1 Noncoding
## 6 Gene66 1.82 -0.08 0.61 1.99 0.17 P1 Noncoding
## 7 Gene106 3.16 0.77 -0.11 0.67 -0.37 P2 Coding
## 8 Gene111 1.99 -0.47 0.36 0.19 -0.83 P2 Coding
## 9 Gene134 1.99 -0.83 0.08 -0.31 1.92 P3 Coding
## 10 Gene135 1.66 -1.58 0.43 0.22 -0.91 P3 Coding
## 11 Gene145 2.24 -1.69 0.70 -0.94 1.63 P3 Coding
## 12 Gene173 1.63 -0.89 0.59 2.56 -0.41 P4 Coding
## 13 Gene185 1.75 -0.20 -0.96 -0.40 -0.40 P4 Noncoding
## 14 Gene199 1.76 1.02 0.06 -0.79 0.69 P4 Noncoding
Filter those genes whose gene expression values on Day1 (D1) > 1.5 and Day2 (D2) > 1.
filter(gen, D1> 1.5, D2>1);
## Gene D1 D2 D3 D4 D5 Path Genetype
## 1 Gene199 1.76 1.02 0.06 -0.79 0.69 P4 Noncoding
Q2) Filter those genes whose expression on Day1 > 1.5 and belong to Pathway P2.
Filter genes whose D1 or D2 expression is > 1.5
# Using OR condition: |
temp=filter(gen, D1 > 1.5 | D2 > 1.5);
head(temp);
## Gene D1 D2 D3 D4 D5 Path Genetype
## 1 Gene2 -0.12 1.66 -1.74 -0.43 -1.31 P1 Coding
## 2 Gene18 1.78 -0.07 1.28 0.69 -0.41 P1 Coding
## 3 Gene25 -2.09 2.12 0.11 -0.72 -0.17 P1 Coding
## 4 Gene26 2.12 0.14 0.54 -0.43 1.88 P1 Coding
## 5 Gene30 1.69 -0.26 0.05 -0.29 0.80 P1 Coding
## 6 Gene36 1.83 -0.28 -1.08 -1.08 0.43 P1 Noncoding
dim(temp);
## [1] 27 8
Filter genes from pathway P2 whose D1 or D2 expression is > 1.5
filter(gen, D1 > 1.5 | D2 > 1.5, Path=="P2")
## Gene D1 D2 D3 D4 D5 Path Genetype
## 1 Gene75 0.05 2.33 1.25 0.79 1.75 P2 Noncoding
## 2 Gene81 -1.90 1.57 -1.05 -0.63 -0.26 P2 Noncoding
## 3 Gene106 3.16 0.77 -0.11 0.67 -0.37 P2 Coding
## 4 Gene109 0.55 1.65 -0.49 0.56 -0.66 P2 Coding
## 5 Gene111 1.99 -0.47 0.36 0.19 -0.83 P2 Coding
Piping minimizes the use of variables. It cleans the code. Results from left hand site is piped to right hand side.
How many genes are there in Pathway P2
filter(gen, Path=="P2") %>% nrow
## [1] 60
filter(gen, Path=="P2") %>% filter(., D1 > 1) %>% nrow
## [1] 6
# . is optional if used as 1st argument
filter(gen, Path=="P2") %>% filter(D1 > 1) %>% nrow
## [1] 6
# Arrange genes first as per Pathway followed by expression values on D1
arrange(gen, Path, D1) %>% head
## Gene D1 D2 D3 D4 D5 Path Genetype
## 1 Gene25 -2.09 2.12 0.11 -0.72 -0.17 P1 Coding
## 2 Gene6 -1.83 1.37 -0.01 -0.99 0.04 P1 Coding
## 3 Gene35 -1.62 0.44 2.04 3.35 -0.58 P1 Noncoding
## 4 Gene20 -1.57 -1.12 1.08 0.60 -0.64 P1 Coding
## 5 Gene3 -1.34 0.04 -0.86 -0.59 -0.59 P1 Coding
## 6 Gene68 -1.32 0.37 1.50 -0.08 1.04 P1 Noncoding
# Arrange genes first as per Pathway followed by expression values on D1 in descending order
arrange(gen, Path, desc(D1)) %>% head
## Gene D1 D2 D3 D4 D5 Path Genetype
## 1 Gene26 2.12 0.14 0.54 -0.43 1.88 P1 Coding
## 2 Gene49 1.90 0.67 0.20 0.07 1.17 P1 Noncoding
## 3 Gene36 1.83 -0.28 -1.08 -1.08 0.43 P1 Noncoding
## 4 Gene66 1.82 -0.08 0.61 1.99 0.17 P1 Noncoding
## 5 Gene18 1.78 -0.07 1.28 0.69 -0.41 P1 Coding
## 6 Gene30 1.69 -0.26 0.05 -0.29 0.80 P1 Coding
# Select 25th to 30th row
slice(gen, 25:30);
## # A tibble: 6 x 8
## Gene D1 D2 D3 D4 D5 Path Genetype
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 Gene25 -2.09 2.12 0.11 -0.72 -0.17 P1 Coding
## 2 Gene26 2.12 0.14 0.54 -0.43 1.88 P1 Coding
## 3 Gene27 0.09 -0.78 -0.72 -0.21 0.47 P1 Coding
## 4 Gene28 -0.49 -0.92 1.56 0.59 -1.37 P1 Coding
## 5 Gene29 1.01 -0.72 0.07 0.32 0.99 P1 Coding
## 6 Gene30 1.69 -0.26 0.05 -0.29 0.80 P1 Coding
# Select D1, D5 and Path column
select(gen, D1, D5, Path) %>% head;
## D1 D5 Path
## 1 0.25 0.10 P1
## 2 -0.12 -1.31 P1
## 3 -1.34 -0.59 P1
## 4 0.43 0.10 P1
## 5 1.04 1.07 P1
## 6 -1.83 0.04 P1
# Exclude D2 column
select(gen, -D2) %>% head;
## Gene D1 D3 D4 D5 Path Genetype
## 1 Gene1 0.25 -0.59 -0.48 0.10 P1 Coding
## 2 Gene2 -0.12 -1.74 -0.43 -1.31 P1 Coding
## 3 Gene3 -1.34 -0.86 -0.59 -0.59 P1 Coding
## 4 Gene4 0.43 -1.88 -0.39 0.10 P1 Coding
## 5 Gene5 1.04 -2.00 -2.44 1.07 P1 Coding
## 6 Gene6 -1.83 -0.01 -0.99 0.04 P1 Coding
# Exclude D2, D5 column
select(gen, -c(D2,D5)) %>% head;
## Gene D1 D3 D4 Path Genetype
## 1 Gene1 0.25 -0.59 -0.48 P1 Coding
## 2 Gene2 -0.12 -1.74 -0.43 P1 Coding
## 3 Gene3 -1.34 -0.86 -0.59 P1 Coding
## 4 Gene4 0.43 -1.88 -0.39 P1 Coding
## 5 Gene5 1.04 -2.00 -2.44 P1 Coding
## 6 Gene6 -1.83 -0.01 -0.99 P1 Coding
# Column name starts with
select(gen, starts_with("D")) %>% head;
## D1 D2 D3 D4 D5
## 1 0.25 0.90 -0.59 -0.48 0.10
## 2 -0.12 1.66 -1.74 -0.43 -1.31
## 3 -1.34 0.04 -0.86 -0.59 -0.59
## 4 0.43 -1.69 -1.88 -0.39 0.10
## 5 1.04 1.04 -2.00 -2.44 1.07
## 6 -1.83 1.37 -0.01 -0.99 0.04
# Column name ends with 3
select(gen, ends_with("3")) %>% head;
## D3
## 1 -0.59
## 2 -1.74
## 3 -0.86
## 4 -1.88
## 5 -2.00
## 6 -0.01
# Column name contains "e"
select(gen, contains("e")) %>% head;
## Gene Genetype
## 1 Gene1 Coding
## 2 Gene2 Coding
## 3 Gene3 Coding
## 4 Gene4 Coding
## 5 Gene5 Coding
## 6 Gene6 Coding
# Select all column from D1 to Path
select(gen, D1:Path) %>% head;
## D1 D2 D3 D4 D5 Path
## 1 0.25 0.90 -0.59 -0.48 0.10 P1
## 2 -0.12 1.66 -1.74 -0.43 -1.31 P1
## 3 -1.34 0.04 -0.86 -0.59 -0.59 P1
## 4 0.43 -1.69 -1.88 -0.39 0.10 P1
## 5 1.04 1.04 -2.00 -2.44 1.07 P1
## 6 -1.83 1.37 -0.01 -0.99 0.04 P1
# Select 2nd and 4th column
select(gen, c(2,4)) %>% head
## D1 D3
## 1 0.25 -0.59
## 2 -0.12 -1.74
## 3 -1.34 -0.86
## 4 0.43 -1.88
## 5 1.04 -2.00
## 6 -1.83 -0.01
# Select unique entries in D1 column
select(gen, D1) %>% distinct %>% nrow
## [1] 154
# Select D2 and Path column then find unique rows
select(gen, D2, Path) %>% distinct %>% nrow
## [1] 191
# mutate(gen,difD1D2=D1-D2): Adds a column named as “difD1D2”, which consists of difference between values of D1 and D2 gene expressions, to the existing dataset.
mutate(gen, diffD1D2 = D1-D2) %>% head
## Gene D1 D2 D3 D4 D5 Path Genetype diffD1D2
## 1 Gene1 0.25 0.90 -0.59 -0.48 0.10 P1 Coding -0.65
## 2 Gene2 -0.12 1.66 -1.74 -0.43 -1.31 P1 Coding -1.78
## 3 Gene3 -1.34 0.04 -0.86 -0.59 -0.59 P1 Coding -1.38
## 4 Gene4 0.43 -1.69 -1.88 -0.39 0.10 P1 Coding 2.12
## 5 Gene5 1.04 1.04 -2.00 -2.44 1.07 P1 Coding 0.00
## 6 Gene6 -1.83 1.37 -0.01 -0.99 0.04 P1 Coding -3.20
# mutate(gen,difD1D2=D1-D2,pdifD1D2=difD1D2*100) Use newly created column difD1D2 to create another pdifD1D2. So, you can create and reuse newly created columns.
mutate(gen, diffD1D2 = D1-D2, pdiffD1D2=diffD1D2*100) %>% head
## Gene D1 D2 D3 D4 D5 Path Genetype diffD1D2 pdiffD1D2
## 1 Gene1 0.25 0.90 -0.59 -0.48 0.10 P1 Coding -0.65 -65
## 2 Gene2 -0.12 1.66 -1.74 -0.43 -1.31 P1 Coding -1.78 -178
## 3 Gene3 -1.34 0.04 -0.86 -0.59 -0.59 P1 Coding -1.38 -138
## 4 Gene4 0.43 -1.69 -1.88 -0.39 0.10 P1 Coding 2.12 212
## 5 Gene5 1.04 1.04 -2.00 -2.44 1.07 P1 Coding 0.00 0
## 6 Gene6 -1.83 1.37 -0.01 -0.99 0.04 P1 Coding -3.20 -320
mutate(gen, diffD1D2 = D1-D2, pdiffD1D2=diffD1D2*100, meanexp=rowMeans(select(gen,D1,D2,D3,D4,D5))) %>% head
## Gene D1 D2 D3 D4 D5 Path Genetype diffD1D2 pdiffD1D2
## 1 Gene1 0.25 0.90 -0.59 -0.48 0.10 P1 Coding -0.65 -65
## 2 Gene2 -0.12 1.66 -1.74 -0.43 -1.31 P1 Coding -1.78 -178
## 3 Gene3 -1.34 0.04 -0.86 -0.59 -0.59 P1 Coding -1.38 -138
## 4 Gene4 0.43 -1.69 -1.88 -0.39 0.10 P1 Coding 2.12 212
## 5 Gene5 1.04 1.04 -2.00 -2.44 1.07 P1 Coding 0.00 0
## 6 Gene6 -1.83 1.37 -0.01 -0.99 0.04 P1 Coding -3.20 -320
## meanexp
## 1 0.036
## 2 -0.388
## 3 -0.668
## 4 -0.686
## 5 -0.258
## 6 -0.284
# Just keeps the new columns only.
transmute(gen, difD1D2=D1-D2, pdifD1D2 = difD1D2 * 100) %>% head
## difD1D2 pdifD1D2
## 1 -0.65 -65
## 2 -1.78 -178
## 3 -1.38 -138
## 4 2.12 212
## 5 0.00 0
## 6 -3.20 -320
summarise(gen, D1mean = mean(D1), D2mean = mean(D2))
## D1mean D2mean
## 1 0.041 0.00235
# Group genes as per pathway
gengrp=group_by(gen, Path);
# For every group then do some operation
summarise(gengrp, n())
## # A tibble: 4 x 2
## Path `n()`
## <chr> <int>
## 1 P1 70
## 2 P2 60
## 3 P3 40
## 4 P4 30
summarise(gengrp, mean(D1), mean(D2), mean(D3))
## # A tibble: 4 x 4
## Path `mean(D1)` `mean(D2)` `mean(D3)`
## <chr> <dbl> <dbl> <dbl>
## 1 P1 0.04528571 -0.0510 -0.1077143
## 2 P2 -0.04500000 0.0115 -0.1471667
## 3 P3 0.15225000 -0.1180 -0.1107500
## 4 P4 0.05466667 0.2690 -0.2690000
summarise(gengrp, mD1=mean(D1), mD2=mean(D2), mD3=mean(D3)) %>% filter(.,mD1>0.05) %>% select(.,Path,mD1);
## # A tibble: 2 x 2
## Path mD1
## <chr> <dbl>
## 1 P3 0.15225000
## 2 P4 0.05466667
summarise(gengrp, OverExp_Genecount_D1=sum(D1>1.5), OverExp_Genecount_D2=sum(D2>1.5), OverExp_Genecount_D3=sum(D3>1.5))
## # A tibble: 4 x 4
## Path OverExp_Genecount_D1 OverExp_Genecount_D2 OverExp_Genecount_D3
## <chr> <int> <int> <int>
## 1 P1 6 3 7
## 2 P2 2 3 4
## 3 P3 3 2 0
## 4 P4 3 5 1
Currently dplyr supports following join types:
Create Pathway information data frame which we will use to demonstrate join
# Select one gene from each pathway, for demo purpose
tempgen = group_by(gen,Path) %>% slice(1);
# Create Pathway information. Note that we dont have pathway information for P4
Pathwayinfo=data.frame(Path=c("P1","P2","P3","P5"), Name=c("Glycolysis","Gluconeogenesis","Purine metabolism","Apotopsis"), ID=c("ID0752","ID3251","ID2567","ID4568"));
Pathwayinfo;
## Path Name ID
## 1 P1 Glycolysis ID0752
## 2 P2 Gluconeogenesis ID3251
## 3 P3 Purine metabolism ID2567
## 4 P5 Apotopsis ID4568
tempgen;
## # A tibble: 4 x 8
## # Groups: Path [4]
## Gene D1 D2 D3 D4 D5 Path Genetype
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 Gene1 0.25 0.90 -0.59 -0.48 0.10 P1 Coding
## 2 Gene71 -1.05 0.84 -1.43 0.65 -1.05 P2 Noncoding
## 3 Gene131 0.59 0.00 0.51 0.20 0.33 P3 Coding
## 4 Gene171 1.44 -0.31 -0.56 -1.09 -0.76 P4 Coding
Pathwayinfo;
## Path Name ID
## 1 P1 Glycolysis ID0752
## 2 P2 Gluconeogenesis ID3251
## 3 P3 Purine metabolism ID2567
## 4 P5 Apotopsis ID4568
# Return all rows from x where there are matching values in y (i.e. matching x rows)
# Return all columns from x and y.
# If there are multiple matches between x and y, all combination of the matches are returned.
inner_join(tempgen, Pathwayinfo, by="Path");
## Warning: Column `Path` joining character vector and factor, coercing into
## character vector
## # A tibble: 3 x 10
## # Groups: Path [?]
## Gene D1 D2 D3 D4 D5 Path Genetype Name
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <fctr>
## 1 Gene1 0.25 0.90 -0.59 -0.48 0.10 P1 Coding Glycolysis
## 2 Gene71 -1.05 0.84 -1.43 0.65 -1.05 P2 Noncoding Gluconeogenesis
## 3 Gene131 0.59 0.00 0.51 0.20 0.33 P3 Coding Purine metabolism
## # ... with 1 more variables: ID <fctr>
# Return all rows from x
# All columns from x and y.
# Rows in x with no match in y will have NA values in the new columns. If there are multiple matches between x and y, all combinations of the matches are returned.
left_join(tempgen, Pathwayinfo, by="Path");
## Warning: Column `Path` joining character vector and factor, coercing into
## character vector
## # A tibble: 4 x 10
## # Groups: Path [?]
## Gene D1 D2 D3 D4 D5 Path Genetype Name
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <fctr>
## 1 Gene1 0.25 0.90 -0.59 -0.48 0.10 P1 Coding Glycolysis
## 2 Gene71 -1.05 0.84 -1.43 0.65 -1.05 P2 Noncoding Gluconeogenesis
## 3 Gene131 0.59 0.00 0.51 0.20 0.33 P3 Coding Purine metabolism
## 4 Gene171 1.44 -0.31 -0.56 -1.09 -0.76 P4 Coding <NA>
## # ... with 1 more variables: ID <fctr>
# Return all rows from y
# all columns from x and y.
# Rows in y with no match in x will have NA values in the new columns. If there are multiple matches between x and y, all combinations of the matches are returned.
right_join(tempgen, Pathwayinfo, by="Path");
## Warning: Column `Path` joining character vector and factor, coercing into
## character vector
## # A tibble: 4 x 10
## # Groups: Path [?]
## Gene D1 D2 D3 D4 D5 Path Genetype Name
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <fctr>
## 1 Gene1 0.25 0.90 -0.59 -0.48 0.10 P1 Coding Glycolysis
## 2 Gene71 -1.05 0.84 -1.43 0.65 -1.05 P2 Noncoding Gluconeogenesis
## 3 Gene131 0.59 0.00 0.51 0.20 0.33 P3 Coding Purine metabolism
## 4 <NA> NA NA NA NA NA P5 <NA> Apotopsis
## # ... with 1 more variables: ID <fctr>
# Return all rows and all columns from both x and y.
# Where there are not matching values, returns NA for the one missing.
full_join(tempgen, Pathwayinfo, by="Path");
## Warning: Column `Path` joining character vector and factor, coercing into
## character vector
## # A tibble: 5 x 10
## # Groups: Path [?]
## Gene D1 D2 D3 D4 D5 Path Genetype Name
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <fctr>
## 1 Gene1 0.25 0.90 -0.59 -0.48 0.10 P1 Coding Glycolysis
## 2 Gene71 -1.05 0.84 -1.43 0.65 -1.05 P2 Noncoding Gluconeogenesis
## 3 Gene131 0.59 0.00 0.51 0.20 0.33 P3 Coding Purine metabolism
## 4 Gene171 1.44 -0.31 -0.56 -1.09 -0.76 P4 Coding <NA>
## 5 <NA> NA NA NA NA NA P5 <NA> Apotopsis
## # ... with 1 more variables: ID <fctr>
# Return all rows from x where there are matching values in y, keeping just columns from x.
semi_join(tempgen, Pathwayinfo, by="Path");
## Warning: Column `Path` joining character vector and factor, coercing into
## character vector
## # A tibble: 3 x 8
## # Groups: Path [?]
## Gene D1 D2 D3 D4 D5 Path Genetype
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 Gene1 0.25 0.90 -0.59 -0.48 0.10 P1 Coding
## 2 Gene71 -1.05 0.84 -1.43 0.65 -1.05 P2 Noncoding
## 3 Gene131 0.59 0.00 0.51 0.20 0.33 P3 Coding
# Return all rows from x where there are not matching values in y, keeping just columns from x.
anti_join(tempgen, Pathwayinfo, by="Path");
## Warning: Column `Path` joining character vector and factor, coercing into
## character vector
## # A tibble: 1 x 8
## # Groups: Path [?]
## Gene D1 D2 D3 D4 D5 Path Genetype
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 Gene171 1.44 -0.31 -0.56 -1.09 -0.76 P4 Coding