Chapter 3 R Programming language primer
The R programming language comprises 6 major types of R objects
3.1 Vectors
The vector type is the fundamental data type in R. The elements of a vector must all have the same mode (data type) of the following type:
- Scalars are individual numbers, they actually are one-element vectors e.g.
x <- 10
- Character strings are single-element vectors of mode ”character” e.g.
y <- "hello"
<- c(1, 2, 3, 4, 5, 6)
x
x#> [1] 1 2 3 4 5 6
<- 1:6
x
x#> [1] 1 2 3 4 5 6
1]
x[#> [1] 1
<- c("Hello", "World")
text
text#> [1] "Hello" "World"
names(x) <- paste("name", 1:6, sep="")
x#> name1 name2 name3 name4 name5 name6
#> 1 2 3 4 5 6
3.1.1 Vectors in action
<- seq(-3, 3, by = .1)
z_scores
z_scores#> [1] -3.0 -2.9 -2.8 -2.7 -2.6 -2.5 -2.4 -2.3 -2.2 -2.1 -2.0 -1.9 -1.8 -1.7
#> [15] -1.6 -1.5 -1.4 -1.3 -1.2 -1.1 -1.0 -0.9 -0.8 -0.7 -0.6 -0.5 -0.4 -0.3
#> [29] -0.2 -0.1 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 1.1
#> [43] 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5
#> [57] 2.6 2.7 2.8 2.9 3.0
<- dnorm(z_scores)
norm.dist
norm.dist#> [1] 0.004431848 0.005952532 0.007915452 0.010420935 0.013582969 0.017528300
#> [7] 0.022394530 0.028327038 0.035474593 0.043983596 0.053990967 0.065615815
#> [13] 0.078950158 0.094049077 0.110920835 0.129517596 0.149727466 0.171368592
#> [19] 0.194186055 0.217852177 0.241970725 0.266085250 0.289691553 0.312253933
#> [25] 0.333224603 0.352065327 0.368270140 0.381387815 0.391042694 0.396952547
#> [31] 0.398942280 0.396952547 0.391042694 0.381387815 0.368270140 0.352065327
#> [37] 0.333224603 0.312253933 0.289691553 0.266085250 0.241970725 0.217852177
#> [43] 0.194186055 0.171368592 0.149727466 0.129517596 0.110920835 0.094049077
#> [49] 0.078950158 0.065615815 0.053990967 0.043983596 0.035474593 0.028327038
#> [55] 0.022394530 0.017528300 0.013582969 0.010420935 0.007915452 0.005952532
#> [61] 0.004431848
mean(norm.dist)
#> [1] 0.1635609
median(norm.dist)
#> [1] 0.1295176
sd(norm.dist)
#> [1] 0.140773
3.2 Matrix
A matrix is a rectangular array of numbers (technically a vector with a number of rows and a number of columns)
<- matrix(x, nrow = 2)
m
m#> [,1] [,2] [,3]
#> [1,] 1 3 5
#> [2,] 2 4 6
<- matrix(x, nrow = 2, byrow = TRUE)
m
m#> [,1] [,2] [,3]
#> [1,] 1 2 3
#> [2,] 4 5 6
3.2.1 Matrices in action
<- matrix(rnorm(10),nrow=5,ncol=20)
normMat 1:5,1:5]
normMat[#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.4608433 -1.2710462 0.4608433 -1.2710462 0.4608433
#> [2,] -0.6669574 0.9299691 -0.6669574 0.9299691 -0.6669574
#> [3,] 0.5542039 -0.7793734 0.5542039 -0.7793734 0.5542039
#> [4,] -0.4471556 1.3831410 -0.4471556 1.3831410 -0.4471556
#> [5,] -0.8371289 -0.5374330 -0.8371289 -0.5374330 -0.8371289
rownames(normMat) <- paste("Gene", 1:5, sep="_")
1:5,1:5]
normMat[#> [,1] [,2] [,3] [,4] [,5]
#> Gene_1 0.4608433 -1.2710462 0.4608433 -1.2710462 0.4608433
#> Gene_2 -0.6669574 0.9299691 -0.6669574 0.9299691 -0.6669574
#> Gene_3 0.5542039 -0.7793734 0.5542039 -0.7793734 0.5542039
#> Gene_4 -0.4471556 1.3831410 -0.4471556 1.3831410 -0.4471556
#> Gene_5 -0.8371289 -0.5374330 -0.8371289 -0.5374330 -0.8371289
#Calculate mean for each matrix row
apply(normMat, MARGIN=1, mean)
#> Gene_1 Gene_2 Gene_3 Gene_4 Gene_5
#> -0.4051014 0.1315059 -0.1125847 0.4679927 -0.6872809
#Calculate mean for each matrix column
apply(normMat, MARGIN=2, mean)
#> [1] -0.18723893 -0.05494848 -0.18723893 -0.05494848 -0.18723893 -0.05494848
#> [7] -0.18723893 -0.05494848 -0.18723893 -0.05494848 -0.18723893 -0.05494848
#> [13] -0.18723893 -0.05494848 -0.18723893 -0.05494848 -0.18723893 -0.05494848
#> [19] -0.18723893 -0.05494848
3.3 Data Frame
A data frame is a data set that contains data of different modes (i.e. different types). A data frame is a list with each component of a list being a vector corresponding to a column in the data.
<- data.frame(cancerType = c("PDAC", "PDAC", "PDAC"),
df KRAS_status = c("wt", "mut", "mut"), Subtype = c("Classical", "Classical", "Subtype"))
df#> cancerType KRAS_status Subtype
#> 1 PDAC wt Classical
#> 2 PDAC mut Classical
#> 3 PDAC mut Subtype
3.3.1 Data Frames in action
#Extract cancerType
1]
df[,#> [1] PDAC PDAC PDAC
#> Levels: PDAC
#Extract cancerType
$cancerType
df#> [1] PDAC PDAC PDAC
#> Levels: PDAC
#Assigning new data
$Sex <- c("Male", "Female", "Male")
df
df#> cancerType KRAS_status Subtype Sex
#> 1 PDAC wt Classical Male
#> 2 PDAC mut Classical Female
#> 3 PDAC mut Subtype Male
#Excluding factors
<- data.frame(cancerType = c("PDAC", "PDAC", "PDAC"),
df KRAS_status = c("wt", "mut", "mut"), Subtype = c("Classical", "Classical", "Subtype"), stringsAsFactors = FALSE)
$cancerType
df#> [1] "PDAC" "PDAC" "PDAC"
3.4 Factors
Factors are used to represent categorical data. Factors can be ordered or unordered and are an important class for statistical analysis and for plotting.
<- factor(c("Classical", "Basal", "Basal", "Classical"))
subtype
subtype#> [1] Classical Basal Basal Classical
#> Levels: Basal Classical
levels(subtype)
#> [1] "Basal" "Classical"
table(subtype)
#> subtype
#> Basal Classical
#> 2 2
3.4.1 Factors in action
<- c("red", "green")
colours
colours[subtype]#> [1] "green" "red" "red" "green"
3.5 List
Lists can contain items comprising different data types and are similar to a C struct in C++.
<- list(1:20, "KRAS", list(1:6, 7:10))
list1
list1#> [[1]]
#> [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
#>
#> [[2]]
#> [1] "KRAS"
#>
#> [[3]]
#> [[3]][[1]]
#> [1] 1 2 3 4 5 6
#>
#> [[3]][[2]]
#> [1] 7 8 9 10
3.5.1 Lists in action
<- list(rnorm(10), rnorm(10), rnorm(10))
list2 #Access first entry of list2
1]
list2[#> [[1]]
#> [1] 0.2865250 1.6296273 -1.7738138 -0.4006902 0.0556799 -0.8535119
#> [7] -2.2770253 -0.6963454 0.3484086 0.6402198
#Access vector of values in first entry of list2
1]]
list2[[#> [1] 0.2865250 1.6296273 -1.7738138 -0.4006902 0.0556799 -0.8535119
#> [7] -2.2770253 -0.6963454 0.3484086 0.6402198
#Finding mean for each vector series in list2 - returns a list of means
lapply(list2, mean)
#> [[1]]
#> [1] -0.3040926
#>
#> [[2]]
#> [1] -0.4322401
#>
#> [[3]]
#> [1] 0.4697256
#Finding mean for each vector series in list2 - returns a vector of means
sapply(list2, mean)
#> [1] -0.3040926 -0.4322401 0.4697256
3.6 Array
Arrays are matrices with more than 2 dimensions.
<- array(c(11:14, 21:24, 31:34), dim = c(2, 2, 3))
ar
ar#> , , 1
#>
#> [,1] [,2]
#> [1,] 11 13
#> [2,] 12 14
#>
#> , , 2
#>
#> [,1] [,2]
#> [1,] 21 23
#> [2,] 22 24
#>
#> , , 3
#>
#> [,1] [,2]
#> [1,] 31 33
#> [2,] 32 34
3.7 Creating functions
#Function to select the top n most variable genes
<- function(normCounts, n=2000){
rnaSelectTopVarGenes <- apply(normCounts, 1, mad)
mad <- names(sort(mad, decreasing=TRUE))
ords <- normCounts[ords[1:n],]
select
select
}
<- rnaSelectTopVarGenes(normMat, n=2)
top2varGenes head(top2varGenes)
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> Gene_4 -0.4471556 1.383141 -0.4471556 1.383141 -0.4471556 1.383141
#> Gene_1 0.4608433 -1.271046 0.4608433 -1.271046 0.4608433 -1.271046
#> [,7] [,8] [,9] [,10] [,11] [,12]
#> Gene_4 -0.4471556 1.383141 -0.4471556 1.383141 -0.4471556 1.383141
#> Gene_1 0.4608433 -1.271046 0.4608433 -1.271046 0.4608433 -1.271046
#> [,13] [,14] [,15] [,16] [,17] [,18]
#> Gene_4 -0.4471556 1.383141 -0.4471556 1.383141 -0.4471556 1.383141
#> Gene_1 0.4608433 -1.271046 0.4608433 -1.271046 0.4608433 -1.271046
#> [,19] [,20]
#> Gene_4 -0.4471556 1.383141
#> Gene_1 0.4608433 -1.271046
3.8 Loading Data
Loading data from a comma separated file can be easily achieved using the read.delim
or read.csv
functions. There are many alternatives!
<- read.delim("course/Moffitt_gene_signature.csv",
moffitt sep=",", stringsAsFactors = FALSE)
head(moffitt)
#> symbol Group
#> 1 MUC13 Classical
#> 2 GPR35 Classical
#> 3 USH1C Classical
#> 4 BAIAP2L2 Classical
#> 5 GAL3ST1 Classical
#> 6 VIL1 Classical
class(moffitt)
#> [1] "data.frame"
$symbol
moffitt#> [1] "MUC13" "GPR35" "USH1C" "BAIAP2L2" "GAL3ST1"
#> [6] "VIL1" "DDC" "LGALS4" "TSPAN8" "SLC5A1"
#> [11] "SEMA4G" "TNFRSF11A" "HSD17B2" "CKMT1A" "DPCR1"
#> ...
#> [76] "REEP3" "DCBLD2" "NMB" "IKZF2" "GPR87"
#> [81] "MUC16" "IVL" "SPRR1B" "BCAR3" "TTC9"
#> [86] "PTGES" "CST6" "CATSPER1"
3.9 Saving Data
Saving data is also simple. You can write data to comma separated (csv) files using the write.table
function.
The best way to store objects from R is within .RData
files. .RData
files are specific to R and can store as many objects as required.
#Write data to a tab separate file called Moffitt_gene_signature.tsv
write.table(moffitt, "course/Moffitt_gene_signature.tsv",
sep="\t", quote = FALSE, row.names = FALSE)
#Save moffitt to a RData file called moffittSig.RData
save(moffitt,x,y,z,file="course/moffittSig.RData")
#Load themoffitt data.frame into the current environment
load("course/moffittSig.RData")
3.10 Saving plots
How do we save complex plots like the one below?
<- 6 - 1
df <- function(x) {
f dt(x, df)
}plot(f, -4, 4, xlab = "x-axis", ylab = "T Density dt(x)")
<- c(qt(0.025, df), qt(0.975, df))
ci <- seq(-4, ci[1], 0.01)
x <- seq(ci[2], 4, 0.01)
y polygon(c(ci[1], x, ci[1]), c(0, 0, f(x)), col = "red")
polygon(c(ci[2], y, ci[2]), c(f(y), 0, 0), col = "red")
arrows(-3, 0.25, -3, 0.05)
text(-3, 0.3, "Rejection Region")
arrows(3, 0.25, 3, 0.05)
text(3, 0.3, "Rejection Region")
arrows(0, 0.15, 0, 0.05)
text(0, 0.2, "Acceptance Region")

Figure 3.1: Acceptance and rejection regions in a t-test distribution.
It is easy to save the above plot using the pdf()
and dev.off()
functions as follows. All of the plotting commands contained between the pdf() & dev.off()
functions will be sent to file. It is also possible to use the png()
function if you would like to create png’s instead! Check out the grDevices
package for other alternatives.
pdf("course/normalDist.pdf", height=6, width=4)
<- 6 - 1
df <- function(x) {
f dt(x, df)
}plot(f, -4, 4, xlab = "x-axis", ylab = "T Density dt(x)")
<- c(qt(0.025, df), qt(0.975, df))
ci <- seq(-4, ci[1], 0.01)
x <- seq(ci[2], 4, 0.01)
y polygon(c(ci[1], x, ci[1]), c(0, 0, f(x)), col = "red")
polygon(c(ci[2], y, ci[2]), c(f(y), 0, 0), col = "red")
arrows(-3, 0.25, -3, 0.05)
text(-3, 0.3, "Rejection Region")
arrows(3, 0.25, 3, 0.05)
text(3, 0.3, "Rejection Region")
qrrows(0, 0.15, 0, 0.05)
text(0, 0.2, "Acceptance Region")
dev.off()