Plotting with base R

2017-11-13 20_49_03-Plot Zoom.png
The end result 

Copy and paste the following code into RStudio or your R console:

# Making a plot in R using base R ----
# NOTE: the typographic (curly) quotes of the original listing were replaced
# by plain ASCII quotes; R cannot parse curly quotes as string delimiters.

# Peek at the data set used in all plotting examples.
head(mtcars)

# Scatter plot of fuel economy against weight, with the least-squares
# regression line of mpg on wt drawn on top.
plot(mtcars$wt, mtcars$mpg)
abline(lm(mpg ~ wt, data = mtcars))

# The same scatter plot with a title, axis labels, a custom plotting
# symbol (pch) and colours for the points, title, labels and axis.
plot(
  mtcars$wt, mtcars$mpg,
  main = "Miles Per Gallon and Weight",
  xlab = "Weight(1000lbs)", ylab = "Miles per gallon",
  pch = "*", col = "blue", col.main = "royalblue2",
  col.lab = "brown", col.axis = "black"
)

# You can see all the available colour names by running the following command
colors()

# The cex arguments control sizes: cex scales the plotting symbols,
# cex.axis the axis annotation and cex.lab the axis labels.
plot(
  mtcars$wt, mtcars$mpg,
  main = "Miles Per Gallon and Weight",
  xlab = "Weight(1000lbs)", ylab = "Miles per gallon",
  pch = "o", col = "blue", col.main = "royalblue2",
  col.lab = "brown", col.axis = "black",
  cex = 2, cex.axis = 1.5, cex.lab = 1.5
)

# Adding text to a plot ----
# text() draws onto the current plot, so the scatter plot has to be created
# first. (Curly quotes of the original listing replaced by ASCII quotes.)
plot(
  mtcars$wt, mtcars$mpg,
  main = "Miles Per Gallon and Weight",
  xlab = "Weight(1000lbs)", ylab = "Miles per gallon",
  pch = "o", col = "blue", col.main = "royalblue2",
  col.lab = "brown", col.axis = "black",
  cex = 2, cex.axis = 1.5, cex.lab = 1.5
)

# Label every point with its row name, shifted up by 1 mpg so the label
# does not sit on top of the symbol, then add one large free-standing label.
text(mtcars$wt, mtcars$mpg + 1, labels = rownames(mtcars), cex = 0.7)
text(4.5, 30, labels = "Car Makes", cex = 4, col = "red")

# mtext() writes into the margin of the plot; side = 1 is the bottom
# (x axis) margin, side = 4 (used below) is the right-hand margin.
plot(
  mtcars$wt, mtcars$mpg,
  main = "Miles Per Gallon and Weight",
  xlab = "Weight(1000lbs)", ylab = "Miles per gallon",
  pch = "o", col = "blue", col.main = "royalblue2",
  col.lab = "brown", col.axis = "black",
  cex = 2, cex.axis = 1.5, cex.lab = 1.5
)

text(mtcars$wt, mtcars$mpg + 1, labels = rownames(mtcars), cex = 0.7)
text(4.5, 30, labels = "Car Makes", cex = 4, col = "red")
mtext(
  "Assume that you have another axis here (dual)",
  side = 4, cex = 1.5, col = "magenta1"
)

# Dual-axis plots ----
# Simulated data: a linearly growing population and a random GDP series.
set.seed(1001)
year <- 1901:1920
population <- 1:20
gdp <- sample(40:50, 20, replace = TRUE)

year
population
gdp

# Widen the right margin to make room for the second y axis. par() returns
# the previous settings, which are restored at the end of this section.
old_par <- par(mar = c(5, 4, 4, 5) + 0.1)

# First series, drawn against the left-hand y axis.
plot(
  year, population,
  type = "b", ylim = c(0, 21), lwd = 2,
  main = "Multiple Y Axes", col = "green3", pch = 16
)

# par(new = TRUE) makes the next plot() call draw on top of the existing
# plot instead of starting a fresh one.
par(new = TRUE)

# Second series with its own y scale; axes and labels are suppressed so
# they do not overprint those of the first plot (the title is already there).
plot(
  year, gdp,
  type = "b", axes = FALSE, xlab = "", ylab = "",
  ylim = c(30, 60), lwd = 2, col = "red", pch = 15
)

# Draw the second axis on the right; it uses the scale of the most recent
# plot (ylim is not an axis() argument, so it is not passed here).
axis(side = 4)
mtext("gdp", side = 4, line = 3)
legend(
  "bottomright",
  inset = 0.05, col = c("green3", "red"),
  lty = 1, legend = c("Pop", "GDP")
)

# Put the margins back so later plots are not affected.
par(old_par)

Advertisements

Second Weekend straight, obsessed with R — Part IV

Copy and paste the following code into RStudio or your R console:

# Matrices and frequency tables ----
library(datasets)

# Memory footprint of the data frame versus the same data as a matrix.
object.size(mtcars)
mt.cars.mat <- as.matrix(mtcars)
object.size(mt.cars.mat)
# Size is the second most important difference between a matrix and a
# data frame. The most important one is that a data frame can hold columns
# of different types, whereas a matrix holds a single type.

# Build a matrix from scratch; values fill the columns first by default.
m1 <- matrix(seq_len(100), ncol = 5)
head(m1)
class(m1)

# Notice the difference: with byrow = TRUE the values fill the rows first.
m1 <- matrix(seq_len(100), ncol = 5, byrow = TRUE)
head(m1)

dim(m1)

# Subsetting works as it does for data frames: [rows, columns].
m1[, 1:2]
m1[seq_len(5), 1:3]

# Elements on the main diagonal.
diag(m1)

# Handy aggregate helpers (they are available for data frames as well).
rowSums(m1)
rowMeans(m1)
colSums(m1)
colMeans(m1)
cumsum(m1)

# Frequency tables ----
head(mtcars)

# Count how often each value of the cyl column occurs.
table(mtcars$cyl)

# Two-way frequency table of cyl against vs.
tb <- table(mtcars$cyl, mtcars$vs)
tb
class(tb)

# In this way, we can see the gear per cyl.
# The first argument becomes the rows of the resulting table and the
# second argument becomes the columns.
table(mtcars$cyl, mtcars$gear)

# Let's automate the solution: turn the table into a data frame whose row
# names are the cyl values and whose column names are the gear values.
tb <- table(mtcars$cyl, mtcars$gear)
result <- as.data.frame.matrix(tb)
result

# Suppose we search for the intersection of 6 cylinders and 4 gears.
# Row and column names are character strings, so index with "6" and "4".
result["6", "4"]

# Merging data frames ----
# (Curly quotes of the original listing replaced by ASCII quotes.)
set.seed(100)
df1 <- data.frame(
  FruitId = 1:10,
  Subject = sample(c("Apple", "Banana", "Mongo"), 10, replace = TRUE)
)
df1

df2 <- data.frame(
  FruitNum = c(2, 4, 6, 12),
  Cuisine = sample(c("Chinese", "Mexican", "Italian"), 4, replace = TRUE)
)
df2

# The key column has a different name in each data frame, so it is given
# separately for the left (by.x) and the right (by.y) data frame.
merge(df1, df2, by.x = "FruitId", by.y = "FruitNum")

# By default, we only get the rows whose key occurs in both data frames
# (inner join). The other join types are selected with the all* arguments:
#   all = TRUE    outer join: keep all rows of both data frames
#   all.y = TRUE  right join: keep all rows of the right data frame
#   all.x = TRUE  left join:  keep all rows of the left data frame
merge(df1, df2, by.x = "FruitId", by.y = "FruitNum", all = TRUE)
merge(df1, df2, by.x = "FruitId", by.y = "FruitNum", all.y = TRUE)
merge(df1, df2, by.x = "FruitId", by.y = "FruitNum", all.x = TRUE)

Second Weekend straight, obsessed with R — Part III

Copy and paste the following code into RStudio or your R console:

# Data frames and matrices ----
# In general, data frames are used to store 2-dimensional data.
# (Curly quotes of the original listing replaced by ASCII quotes.)
set.seed(100)

# Three columns of equal length (18), each containing some missing values
# or random content: integers, characters and a factor.
a <- c(1:17, NA)
a
b <- rep(c("a", "b", "c", "d", NA, "f"), times = 3)
b
c <- factor(sample(c("red", "blue", "green"), 18, replace = TRUE))
c

df1 <- data.frame(v1 = a, v2 = b, v3 = c)
df1
typeof(df1)
class(df1)

# Dimensions and column names.
dim(df1)
nrow(df1)
ncol(df1)
colnames(df1)
colnames(df1) <- c("A", "B", "C")
colnames(df1)

# First rows, last three rows, and the spreadsheet-style viewer.
head(df1)
tail(df1, 3)
# View() needs an interactive session (e.g. RStudio), so it is skipped when
# the script is run non-interactively.
if (interactive()) {
  View(df1)
}

# Transposing a data frame converts it into a matrix.
mat <- t(df1)
mat
class(mat)

summary(df1)
str(df1)

# Convert column B to character; the commented lines are equivalent ways
# of addressing the same column.
df1$B <- as.character(df1$B) # or
# df1$B <- as.character(df1[, 2])
# df1$B <- as.character(df1[, "B"])
str(df1)

# Selecting multiple columns, by position ...
df1[, c(1, 2)]
# ... or by name (note: this selects columns A and C, not A and B)
df1[, c("A", "C")]

# Selecting multiple rows (and columns) by position
df1[1:5, c(1, 2)]

# Selecting rows by a condition on column A. which() returns the positions
# where the condition is TRUE and silently drops the NA result.
df1[which(df1$A >= 12), ] # or

index <- which(df1$A > 8)
index
df1[index, ] # or

# Indexing with the logical vector directly keeps an all-NA row for every
# NA in the condition.
df1[df1$A > 9, ] # or
# subset() filters rows (subset =) and picks columns (select =) in one call.
subset(df1, select = c(1, 2), subset = A >= 10)

# Eliminating missing values: na.omit() drops every row containing an NA
# in any column; df1 itself is left unchanged.
df2 <- na.omit(df1)
df2
df1

# Keep only the rows where column A is not missing.
index <- !is.na(df1$A)
index
# now, the first column is without NA's
df1[index, ] # or
df1[!is.na(df1$A), ]

# A mini challenge ----
# Build the data frame straight from the vectors a, b and c; the column
# names are then a, b and c, and character columns become factors.
df1 <- data.frame(a, b, c, stringsAsFactors = TRUE)
df1
str(df1)

# Now, remove all missing values and keep only those rows that
# contain either "green" or "blue" in column c
index <- df1$c %in% c("green", "blue")
index
df2 <- df1[index, ]
df2
df2 <- na.omit(df2)
df2

Let’s spend the weekend to consolidate our skills in R — Part II

2017-11-05 21_03_42-Lists _ StackSkills.png
Understanding the lists – a nice visualization

Copy and paste the following code into RStudio or your R console:

# R essentials ----
# Lists
set.seed(100)
x <- letters # letters is a built-in constant: the 26 lower-case letters
x

y <- 1:26
y

z <- round(runif(26, 1, 26))
z

# Having the 3 variables in the same object and preserving their type
m <- list(x, y, z)
m
class(m)

# Single brackets return a list containing the element; double brackets
# return the element itself.
m[1]
class(m[1])
class(m[[1]])
m[[1]][4]

# Or flatten the list into one vector
m1 <- unlist(m)
m1

m1[4]
# however, now the numbers are also characters

# Let's go a level deeper: a list that contains another list
n <- list(x, m)
n
# Getting the letter d now seems a bit challenging:
# second element of n (the list m), its first element (letters), item 4
n[[2]][[1]][4]

# Where is the letter r present? Use the letters (m[[1]]) as a logical mask
# on the positions (m[[2]]).
m
m[[1]]
m[[2]]
m[[2]][m[[1]] == "r"]

# Set operations ----
x <- 1:7
y <- 4:10

# Plain positional indexing of one vector by the other; positions past the
# end of the indexed vector come back as NA.
y[x]
x[y]

# Logical mask: which elements of x also occur in y?
out <- x %in% y
out

# The elements of x that are present in y
x[out]

# Concatenating keeps duplicates ...
a <- c(x, y)
a
length(a)
# ... so the number of distinct values is smaller than the length of a
length(unique(a))

# The set functions combine the two vectors without duplicates
union(x, y)
intersect(x, y)

# Remove all items from y that are present in x
x
y
setdiff(y, x)

# Find the items that are not common to x and y: everything in the union
# that is not in the intersection.
x
y
setdiff(union(x, y), intersect(x, y))

# Sampling and sorting ----
a <- seq_len(100)
a

# Draw 10 random numbers from a; fixing the seed first makes the draw
# reproducible.
set.seed(100)
sample(a, size = 10)

# Sampling with replacement allows the same value to be drawn repeatedly.
b <- sample(a, size = 30, replace = TRUE)
b

# sort() returns the values in increasing (or decreasing) order.
sort(b)
sort(b, decreasing = TRUE)

# order() returns the positions of the original items in increasing
# order, so indexing b with it yields the sorted values.
b
o <- order(b)
b[o]

# Decreasing order: reverse the sorted vector ...
rev(b[o])
# ... which gives the same values as ordering by the negated vector.
b[order(-b)]

# Check conditions ----
# (Curly quotes of the original listing replaced by ASCII quotes.)
a <- 2
b <- 5

# if / else takes a single TRUE or FALSE.
if (a < b) {
  print("Less")
} else {
  print("More")
}

# ifelse() is the vectorised form: one result per element of the condition.
a <- 1:10
ifelse(a %% 2 == 0, "Even", "Odd")

# sum(a) < 5 is a single value, so a plain if / else expression is used and
# its value is assigned to x (rather than assigning inside ifelse()).
x <- if (sum(a) < 5) {
  "The sum is less than 5"
} else {
  "The sum is more than 5"
}
x

# Challenge: create a character vector of the same length as a that has
# "Yes" in positions where the number is a multiple of three and "No"
# otherwise.
a <- round(runif(15, 1, 100))
char <- ifelse(a %% 3 == 0, "Yes", "No")
char

# For loops ----
# Print the numbers 1 to 10.
for (i in seq_len(10)) {
  print(i)
}

# Skip an iteration: next jumps straight to the following value of i,
# so 5 is not printed.
for (i in seq_len(10)) {
  if (i == 5) {
    next
  }
  print(i)
}

# Break out of the loop: break ends the loop as soon as i reaches 5,
# so only 1 to 4 are printed.
for (i in seq_len(10)) {
  if (i == 5) {
    break
  }
  print(i)
}

Let’s spend the weekend to consolidate our skills in R — Part I

2017-11-05 21_04_15-RStudio.png
Inside the R Studio

Copy and paste the following code into RStudio or your R console:

# Introductory code in R ----
print("Hello World!")
5 / 2 * 3 # observe the order of the operations: evaluated left to right
10 %% 3 # the remainder
10 %/% 3 # the quotient
?print # help page for print
??print # search all help pages for "print"

# The current working directory
getwd()

# Install devtools only when it is not installed yet, so re-running the
# script does not download the package again, then attach it.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)

# Working with vectors ----
a <- "abc"
a
# R is case sensitive: A is a different (undefined) name, so this is an
# error. try() shows the error without stopping a sourced script.
try(A)

# The class follows the value that is assigned.
class(a)
a <- 13
class(a)

a <- as.character(a)
class(a)

# Other types are integers, logicals and factors
# Data structures are a bit different. In R, we have vectors,
# matrices, data frames and lists. Vectors and matrices are
# homogeneous, whereas data frames and lists are heterogeneous

# Initializing a numeric vector (ten zeros)
a <- numeric(10)
a

# Initializing a character vector (ten empty strings)
b <- character(10)
b

# Elements can be assigned one at a time ...
a[10] <- 10
a[1] <- 1
a
# ... but creating the whole vector at once is quicker:

a <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
a
# Even faster: the colon operator creates an integer sequence
a <- 10:20
a
class(a)

# Other ways of creating a vector are the following:
# rep() repeats a value or a whole vector
c <- rep(1, 5)
c
d <- rep(c(1, 2, 3), 3)
d

# seq() with a step size ...
e <- seq(1, 10, by = 2)
e

# ... with a fixed number of equally spaced values ...
f <- seq(1, 10, length.out = 20)
f

# ... or with a start, a step size and a number of values
g <- seq(1, by = 2, length.out = 10)
g

# The 5th element
g[5]

# Everything except the 5th number
g[-5]

# If you want multiple values from the vector
g[c(5, 6)]

# Factors ----
# (Curly quotes of the original listing replaced by ASCII quotes.)
fac <- factor(c("red", "blue", "white", "red", "blue"))
fac
class(fac)
levels(fac)
# Assigning to levels() renames the existing levels position by position.
levels(fac) <- c("purple", "black", "orange")
fac

length(fac)
# NOTE(review): `a` is expected to still hold the 11 values 10:20 from the
# vector examples earlier in this script; the 11 names below rely on that.
length(a)
a
names(a) <- c(
  "Ten", "Eleven", "Twelve", "Thirteen", "Fourteen", "Fifteen",
  "Sixteen", "Seventeen", "Eighteen", "Nineteen", "Twenty"
)
a
names(a)

# Another example
# Try grabbing the first and second to last item from the d vector

a <- 1:10
d <- c(a, a * 2, a * 3)
d
length(d)

# First solution: one element at a time
d[1]
d[length(d) - 1]
# Also, consolidating both positions into one index vector:
d[c(1, length(d) - 1)]

# Random numbers, rounding and binning ----
# Fixing the seed yields the same random numbers every time the code runs.
set.seed(10)
a <- runif(10, min = 100, max = 150)
a

# Obtaining only the integer part of the values. For the positive numbers
# used here trunc() and floor() give the same result.
b <- trunc(a)
b
b <- floor(a)
b

# Rounding up to the next integer
c <- ceiling(a)
c

# Rounding to one decimal place
d <- round(a, digits = 1)
d

# Let us bin the vector: break points every 10 units from 100 to 150
bins <- seq(100, 150, by = 10)
bins

# cut() assigns every value to the interval it falls into
d1 <- cut(d, breaks = bins)
d1

# pretty() proposes "round" break points covering the data, sparing us
# from writing the bins variable by hand
d2 <- pretty(d, n = 5)
d2

# Let's combine the methods
d3 <- cut(d, breaks = pretty(d, n = 5))
d3

# 25 random numbers between 1 and 10, reduced to their integer part
set.seed(100)
rNumbers <- runif(25, min = 1, max = 10)
rNumbers

rNumbers <- trunc(rNumbers)
rNumbers

# Dealing with missing values ----
a <- 21:30
a

# Introduce a missing value at position 9
a[9] <- NA
a

# Is there any NA at all, and where?
anyNA(a)
is_missing <- is.na(a)
is_missing
# a == NA does not work

# Negating the mask selects the values that are present
!is_missing
a[!is_missing]

# Replacing the NA with the mean of the remaining values
a[is_missing] <- mean(a, na.rm = TRUE)
a

# The which operator ----
set.seed(100)
a <- round(runif(25, min = 1, max = 100))
a

# Finding which numbers are greater than 25: which() returns positions,
# and indexing with them returns the values.
pos <- which(a > 25)
pos
a[pos]

# Reusable logical masks for the divisibility tests below
div_by_3 <- a %% 3 == 0
div_by_4 <- a %% 4 == 0

# Finding which numbers are multiples of both 3 and 4
pos <- which(div_by_3 & div_by_4)
pos
a[pos] # the original author reports only 36 for this seed

# Let's relax the condition a bit: multiples of 3 or of 4
pos <- which(div_by_3 | div_by_4)
pos
a[pos]

# The above examples work without the which function as well
a[div_by_3 & div_by_4]
a[div_by_3 | div_by_4]
# In this case, instead of positions, we pass a logical vector
# indicating which values to include:
div_by_3 & div_by_4

# Now, getting the even numbers from the vector a seems quite intuitive
a[a %% 2 == 0]

Functions in R (a bit deeper)

Copy and paste the following code into RStudio or your R console:

# Functions ----

# hypotenuse(): length of the hypotenuse of a right-angled triangle.
#   side1, side2  lengths of the two other sides (both required)
# Returns sqrt(side1^2 + side2^2).
hypotenuse <- function(side1, side2) {
  sqrt(side1^2 + side2^2)
}

# Arguments can be passed by position or by name.
hypotenuse(10, 10)
hypotenuse(side1 = 10, side2 = 25)
# If we do not supply a value for a parameter, we get an error.
# try() shows the error without stopping a sourced script.
try(hypotenuse(25))
# Missing arguments can be dealt with by giving the parameters defaults.
# hypotenuse(): sqrt(side1^2 + side2^2); both sides default to 10.
hypotenuse <- function(side1 = 10, side2 = 10) {
  sum_of_squares <- side1^2 + side2^2
  sqrt(sum_of_squares)
}

# No arguments: both defaults are used; one argument: the other defaults.
hypotenuse()
hypotenuse(side1 = 5)

# Best practice, though, is a NULL default plus an explicit check.
# hypotenuse(): sqrt(side1^2 + side2^2).
#   side1, side2  lengths of the two sides; NULL (the default) means
#                 "not supplied" and raises an error.
hypotenuse <- function(side1 = NULL, side2 = NULL) {
  # || is the scalar "or" intended for if() conditions.
  if (is.null(side1) || is.null(side2)) {
    stop("One of the sides is missing")
  }
  sqrt(side1^2 + side2^2)
}

hypotenuse(12, 12)
# Only one side given: this is an error. try() shows it without stopping
# a sourced script.
try(hypotenuse(12))

# Or: no defaults at all, checking with missing() whether the caller
# supplied each argument.
# hypotenuse(): sqrt(side1^2 + side2^2); errors if a side is not supplied.
hypotenuse <- function(side1, side2) {
  # || is the scalar "or" intended for if() conditions.
  if (missing(side1) || missing(side2)) {
    stop("One of the sides is missing")
  }
  sqrt(side1^2 + side2^2)
}

hypotenuse(12, 12)
# Only one side given: this is an error. try() shows it without stopping
# a sourced script.
try(hypotenuse(12))

# Alternatively: stopifnot() raises an error unless every condition is TRUE.
# hypotenuse(): sqrt(side1^2 + side2^2); errors if a side is not supplied
# by the caller (missing() is TRUE even though a NULL default exists).
hypotenuse <- function(side1 = NULL, side2 = NULL) {
  stopifnot(!missing(side1), !missing(side2))
  sqrt(side1^2 + side2^2)
}

hypotenuse(12, 12)
# Only one side given: this is an error. try() shows it without stopping
# a sourced script.
try(hypotenuse(12))

# Optional arguments ----
# hypotenuse(): sqrt(side1^2 + side2^2), optionally rounded.
#   side1, side2  lengths of the two sides (both required)
#   round         if TRUE, the result is passed through round()
#   ...           further arguments forwarded to round(), e.g. digits
hypotenuse <- function(side1, side2, round = FALSE, ...) {
  if (missing(side1) || missing(side2)) {
    stop("One of the sides is missing")
  }
  result <- sqrt(side1^2 + side2^2)
  if (round) {
    # The argument `round` is a logical flag; in a call, R looks for a
    # function of that name, so round() below is still base::round().
    result <- round(result, ...)
  }
  result
}

hypotenuse(side1 = 12, side2 = 12)
hypotenuse(side1 = 12, side2 = 12, round = TRUE)
hypotenuse(12, 12, TRUE)

# The triple dots (...) in the function allow for calls like the following,
# where digits is handed on to round():
hypotenuse(side1 = 12, side2 = 12, round = TRUE, digits = 2)

# In R, simple mathematical operators are also functions and can be called
# like any other function when their name is wrapped in backticks:
`+`(2, 3)
`*`(2, 5)
# Of course, it is quite rare to use them like this, but it demonstrates
# that we can define our own operator: any function whose name has the
# form %name% can be used between its two arguments.
`%***%` <- function(x, y) {
  sqrt(x^2 + y^2)
}
10 %***% 2
# The same value, computed with the hypotenuse() function defined earlier
# in this script.
hypotenuse(10, 2)
# I believe that this is almost mind-blowing!!!!

# A mini challenge ----
# Return the minimum of two numbers with your own function

# minimum(): the smaller of two single numbers.
#   a, b  the numbers to compare; NULL (the default) means "not supplied"
# Returns a or b, whichever is smaller. When both are equal, the string
# "The values are equal" is returned instead of a number.
minimum <- function(a = NULL, b = NULL) {
  if (is.null(a) || is.null(b)) {
    stop("One of the numbers is missing")
  }
  if (a < b) {
    a
  } else if (b < a) {
    b
  } else {
    "The values are equal"
  }
}

# No arguments: this is an error. try() shows it without stopping a
# sourced script.
try(minimum())
minimum(2, 2)
minimum(3, 10)
minimum(25, 20)

Dates and Strings in R

Copy and paste the following code into RStudio or your R console:

# Dates ----
# install.packages("lubridate")
library(lubridate)

# The current date-time and its class
Sys.time()
class(Sys.time())

# ymd() parses a year-month-day value; with a tz argument the class of the
# result changes, as the two class() calls show.
d1 <- "2014-11-28"
d1
d1 <- ymd(d1)
class(d1)
d1 <- ymd(d1, tz = "UTC")
class(d1)

# Extracting components of the date
day(d1)
month(d1)
week(d1)
weekdays(d1)

# The difference between two date-times, in default and explicit units.
# (The original listing had a typographic dash instead of the minus sign.)
d2 <- "2015-12-13"
d2 <- ymd(d2, tz = "UTC")
d <- d2 - d1
d
difftime(d2, d1, units = "weeks")
difftime(d2, d1, units = "hours")

# String manipulations ----
# The separators of the original listing had been turned into typographic
# dashes and curly quotes; they are written with plain ASCII here.

# paste() joins its arguments, separated by a space unless sep is given.
paste("txt1", "txt2")
paste("txt1", "txt2", "txt3", sep = "------")

# With vectors, paste() works element by element; collapse then joins the
# resulting elements into one single string.
paste(c("a", "b"), c("c", "d"), sep = "--")
paste(c("a", "b"), c("c", "d"), sep = "--", collapse = "&")

# Common string operations ----
# (Curly quotes of the original listing replaced by ASCII quotes.)
# install.packages("stringr")
library(stringr)
a <- "The Lord of the Rings and Harry Potter are the most fascinating tales ever told in English. 12.3."
b <- " @r_programming "

# Number of characters in each string
str_count(b)
str_count(a)

# Changing the case
str_to_upper(a)
str_to_title(a)

# Split a string into words at every space; str_split() returns a list,
# unlist() turns it into a plain character vector.
str_split(a, " ")
unlist(str_split(a, " "))

# Everything from the 2nd character to the end of the string
substr(b, 2, str_count(b))

# Grab the 2nd to 5th word of a sentence
word(a, 2:5)

# Does the string contain punctuation / an @ sign?
str_detect(b, "[:punct:]")
str_detect(b, "@")

# We can replace characters in a string
str_replace_all(a, "t", "L")
# Here, for instance, we replace all lower case t with upper case L
# Or remove all punctuation
str_replace_all(b, "[:punct:]", "")
# Or replace every digit
str_replace_all(a, "[0-9]", "Hi!")

# Let's trim some whitespace
str_trim(b, side = "both")

# A mini challenge ----
# Extract the names of the two books from the string a and put them in
# title case. (Curly quotes of the original listing replaced by ASCII.)
a <- "The Lord of the Rings and Harry Potter are the most fascinating tales ever told in English"

# The first eight words contain both titles.
y <- word(a, 1:8)
y

# Without collapse, paste() returns the five words as separate elements ...
LOTR <- paste(y[1:5])
LOTR
# ... with collapse they are joined into a single string.
LOTR <- paste(y[1:5], collapse = " ")
LOTR
str_to_title(LOTR)

# Words 7 and 8 form the second title.
HR <- paste(y[7:8], collapse = " ")
HR
str_to_title(HR)