Skip to content

Commit 2887f1b

Browse files
committed
Oct 19, 2022, 10:13 PM
1 parent 81d61ea commit 2887f1b

File tree

1 file changed

+93
-80
lines changed

1 file changed

+93
-80
lines changed

lecture-08-inclass.Rmd

+93-80
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,19 @@ library(tidyverse)
1818
# test live share
1919
yeast.expression <- read_csv("~/Downloads/kelliher-scer-expression-data.csv")
2020
```
21+
22+
23+
## Filter the top 1600 genes in terms of their periodic rank score (see paper for details)
2124
```{r}
2225
yeast.1600 <-
2326
filter(yeast.expression, normalized_per_rank <= 1600) |>
2427
mutate(normalized_per_rank = NULL)
2528
```
2629

2730

31+
## Create "long" version of expression data
2832

33+
Note the use of the `names_transform` argument to ensure that the time data get treated as integer values instead of characters.
2934

3035
```{r}
3136
yeast.long <-
@@ -40,26 +45,28 @@ str(yeast.long)
4045
```
4146

4247

43-
44-
# draw a plot showing NRM expression over time
48+
## Draw a plot showing expression of one gene over time
4549
```{r}
4650
yeast.long |>
47-
filter(gene_ID %in% c("NRM1","HTB2")) |>
51+
filter(gene_ID == "NRM1") |>
4852
ggplot(aes(x = time, y = expression, color=gene_ID)) +
4953
geom_point() +
5054
geom_line()
5155
5256
```
5357

54-
58+
## Draw a plot showing expression of two genes over time
5559
```{r}
5660
yeast.long |>
57-
filter(gene_ID %in% c("NRM1")) |>
58-
ggplot(aes(x = expression)) +
59-
geom_histogram()
60-
61+
filter(gene_ID %in% c("NRM1","HTB2")) |>
62+
ggplot(aes(x = time, y = expression, color=gene_ID)) +
63+
geom_point() +
64+
geom_line()
6165
6266
```
67+
## Problem: Magnitude of gene expression very different for these two genes
68+
69+
We can see this by comparing the mean and standard deviation of expression for these two genes.
6370

6471
```{r}
6572
yeast.long |>
@@ -69,6 +76,9 @@ yeast.long |>
6976
std.dev.expression = sd(expression))
7077
```
7178

79+
## Solution: Put these genes on a common scale by converting data to Z-scores (mean center, scale std dev to be 1)
80+
81+
### "manual approach"
7282
```{r}
7383
yeast.std <-
7484
yeast.long |>
@@ -78,16 +88,32 @@ yeast.std <-
7888
7989
```
8090

91+
### Or using the built-in `scale` function
92+
93+
I show both here, but generally you'd choose one or the other approach
94+
95+
```{r}
96+
yeast.std <-
97+
yeast.long |>
98+
group_by(gene_ID) |>
99+
mutate(std_expression = scale(expression))
100+
101+
```
102+
103+
104+
## Replot with scaled data
81105

82106

83107
```{r}
84108
yeast.std |>
85-
filter(gene_ID %in% c("NRM1", "HTB2")) |>
86-
group_by(gene_ID) %>%
87-
summarize(mean.expression = mean(std_expression),
88-
std.dev.expression = sd(std_expression))
109+
filter(gene_ID %in% c("NRM1","HTB2")) |>
110+
ggplot(aes(x = time, y = std_expression, color=gene_ID)) +
111+
geom_point() +
112+
geom_line()
113+
89114
```
90115

116+
## Let's add one more gene to the mix
91117

92118
```{r}
93119
yeast.std |>
@@ -98,13 +124,13 @@ yeast.std |>
98124
99125
```
100126

127+
Or using a heat-plot representation
101128

102129
```{r}
103130
yeast.std |>
104131
filter(gene_ID %in% c("NRM1","HTB2", "ACE2")) |>
105132
ggplot(aes(x = time, y = gene_ID, fill=std_expression)) +
106133
geom_tile() +
107-
#scale_fill_distiller(palette="PiYG")
108134
scale_fill_gradient2(
109135
low = "cyan",
110136
mid = "black",
@@ -113,110 +139,97 @@ yeast.std |>
113139
114140
```
115141

142+
## Create heat plot for first 100 genes in our data frame
143+
144+
Illustrating how `unique` works
145+
116146
```{r}
117147
unique(yeast.std$gene_ID)[1:100]
118148
```
119149

120150

151+
Switching from `geom_tile` to `geom_raster` because `geom_raster` is more efficient for large heat maps (but less customizable; see docs).
152+
153+
Also showing how to suppress the y-axis ticks and labels
154+
121155
```{r}
122156
yeast.std |>
123157
filter(gene_ID %in% unique(yeast.std$gene_ID)[1:100]) |>
124158
ggplot(aes(x = time, y = gene_ID, fill=std_expression)) +
125-
geom_tile() +
126-
scale_fill_gradient2(
127-
low = "cyan",
128-
mid = "black",
129-
high = "yellow",
130-
midpoint = 0)
159+
geom_raster() +
160+
scale_fill_gradient2(low = "cyan",
161+
mid = "black",
162+
high = "yellow",
163+
midpoint = 0) +
164+
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())
131165
132166
```
133167

134-
# Find the maximum expression of each gene
168+
## Reordering genes by time point of maximum expression
135169

136-
```{r}
137-
max.table <-
138-
yeast.std |>
139-
group_by(gene_ID) %>%
140-
summarize(max.expression = max(std_expression),
141-
max.index = which.max(std_expression))
142170

143-
max.table
144-
```
171+
To find the maximum expression of each gene we could use the `max` function
145172

146-
# apply which.max to our data frame as a whole
147173
```{r}
148174
yeast.std |>
149-
group_by(gene_ID) %>%
150-
mutate(max.index = which.max(std_expression))
151-
175+
group_by(gene_ID) |>
176+
summarize(max.expression = max(std_expression))
152177
```
153178

154-
# use which.max to order our data frame
179+
The `which.max` function tells us the index at which the maximum expression occurs
180+
155181
```{r}
156182
yeast.std |>
157-
group_by(gene_ID) %>%
158-
mutate(max.index = which.max(std_expression)) %>%
159-
pull(max.index)
160-
mutate(gene_ID2 = fct_reorder(gene))
161-
162-
183+
group_by(gene_ID) |>
184+
summarize(max.expression = max(std_expression),
185+
max.index = which.max(std_expression))
163186
```
164187

165-
166-
188+
We can use this information to sort genes by their time point of maximum expression. First, we sort the gene names by their index of maximum expression.
167189

168190
```{r}
169-
max.table
170-
```
171-
```{r}
172-
df <- tibble::tribble(
173-
~color, ~a, ~b,
174-
"blue", 1, 2,
175-
"green", 6, 2,
176-
"purple", 3, 3,
177-
"red", 2, 3,
178-
"yellow", 5, 1
179-
)
180-
fct_reorder(df$color, df$a, min)
181-
182-
```
183-
184-
191+
genes.by.which.max <-
192+
yeast.std |>
193+
group_by(gene_ID) |>
194+
mutate(max.index = which.max(std_expression)) |>
195+
arrange(max.index) |>
196+
pull(gene_ID) |>
197+
unique()
185198
186-
```{r}
187-
levels(fct_reorder(max.table$gene_ID, max.table$max.index, min))
199+
# show the first ten genes sorted by index of max expression
200+
head(genes.by.which.max, n = 10)
188201
```
189202

203+
Then we use the `fct_relevel` function to create a new ordering of the `gene_ID` column.
190204

191205
```{r}
192-
yeast.std
206+
reordered.gene_ID <- fct_relevel(yeast.std$gene_ID, genes.by.which.max)
207+
yeast.std$gene_ID <- reordered.gene_ID
193208
```
194209

195210

211+
Genes will no longer be shown in alphabetical order, but in the order specified by the `genes.by.which.max` vector.
196212

213+
The figure below shows not only the ordered genes, but illustrates a number of other tweaks including:
197214

215+
* how to change the figure height and width in the code block header
216+
* how to set limits on a color scale
217+
* how to "squash" or compress data to fit in those limits (`oob` argument)
218+
* how to reverse a discrete axis (`scale_y_discrete(limits=rev)`).
198219

199-
200-
```{r}
201-
sorted.yeast <-
202-
yeast.std |>
203-
group_by(gene_ID)
204-
205-
sorted.yeast$gene_ID =
206-
207-
208-
```
209-
210-
```{r}
211-
sorted.yeast |>
212-
filter(gene_ID %in% unique(sorted.yeast$gene_ID)[1:100]) |>
220+
```{r, fig.width=3, fig.height=6}
221+
yeast.std |>
213222
ggplot(aes(x = time, y = gene_ID, fill=std_expression)) +
214-
geom_tile() +
215-
scale_fill_gradient2(
216-
low = "cyan",
217-
mid = "black",
218-
high = "yellow",
219-
midpoint = 0)
223+
geom_raster() +
224+
scale_fill_gradient2(low = "cyan",
225+
mid = "black",
226+
high = "yellow",
227+
midpoint = 0,
228+
limits=c(-2,2),
229+
oob = scales::squish) +
230+
theme(axis.text.y = element_blank(),
231+
axis.ticks.y = element_blank()) +
232+
scale_y_discrete(limits=rev)
220233
221234
```
222235

0 commit comments

Comments
 (0)