@@ -18,14 +18,19 @@ library(tidyverse)
18
18
# test live share
19
19
yeast.expression <- read_csv("~/Downloads/kelliher-scer-expression-data.csv")
20
20
```
21
+
22
+
23
+ ## Filter the top 1600 genes in terms of their periodic rank score (see paper for details)
21
24
``` {r}
22
25
yeast.1600 <-
23
26
filter(yeast.expression, normalized_per_rank <= 1600) |>
24
27
mutate(normalized_per_rank = NULL)
25
28
```
26
29
27
30
31
+ ## Create "long" version of expression data
28
32
33
+ Note use of ` names_transform ` argument to insure that the time data get treated as integer values instead of characters
29
34
30
35
``` {r}
31
36
yeast.long <-
@@ -40,26 +45,28 @@ str(yeast.long)
40
45
```
41
46
42
47
43
-
44
- # draw a plot showing NRM expression over time
48
+ ## Draw a plot showing expression of one gene over time
45
49
``` {r}
46
50
yeast.long |>
47
- filter(gene_ID %in% c( "NRM1","HTB2") ) |>
51
+ filter(gene_ID == "NRM1") |>
48
52
ggplot(aes(x = time, y = expression, color=gene_ID)) +
49
53
geom_point() +
50
54
geom_line()
51
55
52
56
```
53
57
54
-
58
+ ## Draw a plot showingexpression of two genes over time
55
59
``` {r}
56
60
yeast.long |>
57
- filter(gene_ID %in% c("NRM1")) |>
58
- ggplot(aes(x = expression)) +
59
- geom_histogram()
60
-
61
+ filter(gene_ID %in% c("NRM1","HTB2" )) |>
62
+ ggplot(aes(x = time, y = expression, color=gene_ID )) +
63
+ geom_point() +
64
+ geom_line()
61
65
62
66
```
67
+ ## Problem: Magnitude of gene expression very different for these two genes
68
+
69
+ Can see this by comparing mean and std dev for expression of these two genes
63
70
64
71
``` {r}
65
72
yeast.long |>
@@ -69,6 +76,9 @@ yeast.long |>
69
76
std.dev.expression = sd(expression))
70
77
```
71
78
79
+ ## Solution: Put these genes on a common scale by converting data to Z-scores (mean center, scale std dev to be 1)
80
+
81
+ ### "manual approach"
72
82
``` {r}
73
83
yeast.std <-
74
84
yeast.long |>
@@ -78,16 +88,32 @@ yeast.std <-
78
88
79
89
```
80
90
91
+ ### Or using the built-in ` scale ` function
92
+
93
+ I show both here, but generally you'd choose one or the other approach
94
+
95
+ ``` {r}
96
+ yeast.std <-
97
+ yeast.long |>
98
+ group_by(gene_ID) |>
99
+ mutate(std_expression = scale(expression))
100
+
101
+ ```
102
+
103
+
104
+ ## Replot with scaled data
81
105
82
106
83
107
``` {r}
84
108
yeast.std |>
85
- filter(gene_ID %in% c("NRM1", "HTB2")) |>
86
- group_by(gene_ID) %>%
87
- summarize(mean.expression = mean(std_expression),
88
- std.dev.expression = sd(std_expression))
109
+ filter(gene_ID %in% c("NRM1","HTB2")) |>
110
+ ggplot(aes(x = time, y = std_expression, color=gene_ID)) +
111
+ geom_point() +
112
+ geom_line()
113
+
89
114
```
90
115
116
+ ## Let's add one more gene to the mix
91
117
92
118
``` {r}
93
119
yeast.std |>
@@ -98,13 +124,13 @@ yeast.std |>
98
124
99
125
```
100
126
127
+ Or using a heat-plot representation
101
128
102
129
``` {r}
103
130
yeast.std |>
104
131
filter(gene_ID %in% c("NRM1","HTB2", "ACE2")) |>
105
132
ggplot(aes(x = time, y = gene_ID, fill=std_expression)) +
106
133
geom_tile() +
107
- #scale_fill_distiller(palette="PiYG")
108
134
scale_fill_gradient2(
109
135
low = "cyan",
110
136
mid = "black",
@@ -113,110 +139,97 @@ yeast.std |>
113
139
114
140
```
115
141
142
+ ## Create heat plot for first 100 genes in our data frame
143
+
144
+ Illustrating how unique works
145
+
116
146
``` {r}
117
147
unique(yeast.std$gene_ID)[1:100]
118
148
```
119
149
120
150
151
+ Switching from ` geom_tile ` to ` geom_raster ` because geom_raster more efficient for large heat maps (but less customizable; see docs).
152
+
153
+ Also showing how to suppress the y-axis ticks and labels
154
+
121
155
``` {r}
122
156
yeast.std |>
123
157
filter(gene_ID %in% unique(yeast.std$gene_ID)[1:100]) |>
124
158
ggplot(aes(x = time, y = gene_ID, fill=std_expression)) +
125
- geom_tile () +
126
- scale_fill_gradient2(
127
- low = "cyan ",
128
- mid = "black ",
129
- high = "yellow",
130
- midpoint = 0)
159
+ geom_raster () +
160
+ scale_fill_gradient2(low = "cyan",
161
+ mid = "black ",
162
+ high = "yellow ",
163
+ midpoint = 0) +
164
+ theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())
131
165
132
166
```
133
167
134
- # Find the maximum expression of each gene
168
+ ## Reordering genes by time point of maximum expression
135
169
136
- ``` {r}
137
- max.table <-
138
- yeast.std |>
139
- group_by(gene_ID) %>%
140
- summarize(max.expression = max(std_expression),
141
- max.index = which.max(std_expression))
142
170
143
- max.table
144
- ```
171
+ To find the maximum expression of each gene we could use the ` max ` function
145
172
146
- # apply which.max to our data frame as a whole
147
173
``` {r}
148
174
yeast.std |>
149
- group_by(gene_ID) %>%
150
- mutate(max.index = which.max(std_expression))
151
-
175
+ group_by(gene_ID) |>
176
+ summarize(max.expression = max(std_expression))
152
177
```
153
178
154
- # use which.max to order our data frame
179
+ The ` which.max ` function tells us the index at which the maximum expression occurs
180
+
155
181
``` {r}
156
182
yeast.std |>
157
- group_by(gene_ID) %>%
158
- mutate(max.index = which.max(std_expression)) %>%
159
- pull(max.index)
160
- mutate(gene_ID2 = fct_reorder(gene))
161
-
162
-
183
+ group_by(gene_ID) |>
184
+ summarize(max.expression = max(std_expression),
185
+ max.index = which.max(std_expression))
163
186
```
164
187
165
-
166
-
188
+ We can use this information to sort gene by their time point of maximum expression. First we sort the gene names by their index of maximum expression
167
189
168
190
``` {r}
169
- max.table
170
- ```
171
- ``` {r}
172
- df <- tibble::tribble(
173
- ~color, ~a, ~b,
174
- "blue", 1, 2,
175
- "green", 6, 2,
176
- "purple", 3, 3,
177
- "red", 2, 3,
178
- "yellow", 5, 1
179
- )
180
- fct_reorder(df$color, df$a, min)
181
-
182
- ```
183
-
184
-
191
+ genes.by.which.max <-
192
+ yeast.std |>
193
+ group_by(gene_ID) |>
194
+ mutate(max.index = which.max(std_expression)) |>
195
+ arrange(max.index) |>
196
+ pull(gene_ID) |>
197
+ unique()
185
198
186
- ``` {r}
187
- levels(fct_reorder(max.table$gene_ID, max.table$max.index, min) )
199
+ # show the first ten genes sorted by index of max expession
200
+ head(genes.by.which.max, n = 10 )
188
201
```
189
202
203
+ Then we use the ` fct_relevel ` function to create a new ordering of the ` gene_ID ` column.
190
204
191
205
``` {r}
192
- yeast.std
206
+ reordered.gene_ID <- fct_relevel(yeast.std$gene_ID, genes.by.which.max)
207
+ yeast.std$gene_ID <- reordered.gene_ID
193
208
```
194
209
195
210
211
+ Genes will no longer be shown in alphabetical order but using the order specified by ` genes.by.which.max ` vector.
196
212
213
+ The figure below shows not only the ordered genes, but illustrates a number of other tweaks including:
197
214
215
+ * how to change the figure height and width in the code block header
216
+ * how to set limits on a color scale
217
+ * how to "squash" or compress data to fit in those limits (` oob ` argument)
218
+ * how to reverse a discrete axis (` scale_y_discrete(limits=rev) ` ).
198
219
199
-
200
- ``` {r}
201
- sorted.yeast <-
202
- yeast.std |>
203
- group_by(gene_ID)
204
-
205
- sorted.yeast$gene_ID =
206
-
207
-
208
- ```
209
-
210
- ``` {r}
211
- sorted.yeast |>
212
- filter(gene_ID %in% unique(sorted.yeast$gene_ID)[1:100]) |>
220
+ ``` {r, fig.width=3, fig.height=6}
221
+ yeast.std |>
213
222
ggplot(aes(x = time, y = gene_ID, fill=std_expression)) +
214
- geom_tile() +
215
- scale_fill_gradient2(
216
- low = "cyan",
217
- mid = "black",
218
- high = "yellow",
219
- midpoint = 0)
223
+ geom_raster() +
224
+ scale_fill_gradient2(low = "cyan",
225
+ mid = "black",
226
+ high = "yellow",
227
+ midpoint = 0,
228
+ limits=c(-2,2),
229
+ oob = scales::squish) +
230
+ theme(axis.text.y = element_blank(),
231
+ axis.ticks.y = element_blank()) +
232
+ scale_y_discrete(limits=rev)
220
233
221
234
```
222
235
0 commit comments